464 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ |
469 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ |
465 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ |
470 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ |
466 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ |
471 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ |
467 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ |
472 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ |
468 #define ARC_DONT_L2CACHE (1 << 16) /* originated by prefetch */ |
473 #define ARC_DONT_L2CACHE (1 << 16) /* originated by prefetch */ |
469 #define ARC_L2_READING (1 << 17) /* L2ARC read in progress */ |
474 #define ARC_L2_WRITING (1 << 17) /* L2ARC write in progress */ |
470 #define ARC_L2_WRITING (1 << 18) /* L2ARC write in progress */ |
475 #define ARC_L2_EVICTED (1 << 18) /* evicted during I/O */ |
471 #define ARC_L2_EVICTED (1 << 19) /* evicted during I/O */ |
476 #define ARC_L2_WRITE_HEAD (1 << 19) /* head of write list */ |
472 #define ARC_L2_WRITE_HEAD (1 << 20) /* head of write list */ |
|
473 |
477 |
474 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) |
478 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) |
475 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) |
479 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) |
476 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) |
480 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) |
477 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) |
481 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) |
478 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) |
482 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) |
479 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) |
483 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) |
480 #define HDR_DONT_L2CACHE(hdr) ((hdr)->b_flags & ARC_DONT_L2CACHE) |
484 #define HDR_DONT_L2CACHE(hdr) ((hdr)->b_flags & ARC_DONT_L2CACHE) |
481 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_L2_READING) |
485 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ |
|
486 (hdr)->b_l2hdr != NULL) |
482 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) |
487 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) |
483 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) |
488 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) |
484 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) |
489 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) |
485 |
490 |
486 /* |
491 /* |
525 * Level 2 ARC |
530 * Level 2 ARC |
526 */ |
531 */ |
527 |
532 |
528 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ |
533 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ |
529 #define L2ARC_HEADROOM 4 /* num of writes */ |
534 #define L2ARC_HEADROOM 4 /* num of writes */ |
530 #define L2ARC_FEED_DELAY 180 /* starting grace */ |
|
531 #define L2ARC_FEED_SECS 1 /* caching interval */ |
535 #define L2ARC_FEED_SECS 1 /* caching interval */ |
532 |
536 |
533 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) |
537 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) |
534 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) |
538 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) |
535 |
539 |
536 /* |
540 /* |
537 * L2ARC Performance Tunables |
541 * L2ARC Performance Tunables |
538 */ |
542 */ |
539 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ |
543 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ |
|
544 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ |
540 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ |
545 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ |
541 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ |
546 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ |
542 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ |
547 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ |
543 |
548 |
544 /* |
549 /* |
547 typedef struct l2arc_dev { |
552 typedef struct l2arc_dev { |
548 vdev_t *l2ad_vdev; /* vdev */ |
553 vdev_t *l2ad_vdev; /* vdev */ |
549 spa_t *l2ad_spa; /* spa */ |
554 spa_t *l2ad_spa; /* spa */ |
550 uint64_t l2ad_hand; /* next write location */ |
555 uint64_t l2ad_hand; /* next write location */ |
551 uint64_t l2ad_write; /* desired write size, bytes */ |
556 uint64_t l2ad_write; /* desired write size, bytes */ |
|
557 uint64_t l2ad_boost; /* warmup write boost, bytes */ |
552 uint64_t l2ad_start; /* first addr on device */ |
558 uint64_t l2ad_start; /* first addr on device */ |
553 uint64_t l2ad_end; /* last addr on device */ |
559 uint64_t l2ad_end; /* last addr on device */ |
554 uint64_t l2ad_evict; /* last addr eviction reached */ |
560 uint64_t l2ad_evict; /* last addr eviction reached */ |
555 boolean_t l2ad_first; /* first sweep through */ |
561 boolean_t l2ad_first; /* first sweep through */ |
556 list_t *l2ad_buflist; /* buffer list */ |
562 list_t *l2ad_buflist; /* buffer list */ |
2280 |
2287 |
2281 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || |
2288 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || |
2282 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || |
2289 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || |
2283 (found == hdr && HDR_L2_READING(hdr))); |
2290 (found == hdr && HDR_L2_READING(hdr))); |
2284 |
2291 |
2285 hdr->b_flags &= ~(ARC_L2_READING|ARC_L2_EVICTED); |
2292 hdr->b_flags &= ~ARC_L2_EVICTED; |
2286 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) |
2293 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) |
2287 hdr->b_flags |= ARC_DONT_L2CACHE; |
2294 hdr->b_flags |= ARC_DONT_L2CACHE; |
2288 |
2295 |
2289 /* byteswap if necessary */ |
2296 /* byteswap if necessary */ |
2290 callback_list = hdr->b_acb; |
2297 callback_list = hdr->b_acb; |
2542 */ |
2551 */ |
2543 |
2552 |
2544 if (GHOST_STATE(hdr->b_state)) |
2553 if (GHOST_STATE(hdr->b_state)) |
2545 arc_access(hdr, hash_lock); |
2554 arc_access(hdr, hash_lock); |
2546 |
2555 |
|
2556 if (hdr->b_l2hdr != NULL) { |
|
2557 vd = hdr->b_l2hdr->b_dev->l2ad_vdev; |
|
2558 addr = hdr->b_l2hdr->b_daddr; |
|
2559 } |
|
2560 |
|
2561 mutex_exit(hash_lock); |
|
2562 |
2547 ASSERT3U(hdr->b_size, ==, size); |
2563 ASSERT3U(hdr->b_size, ==, size); |
2548 DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, |
2564 DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, |
2549 zbookmark_t *, zb); |
2565 zbookmark_t *, zb); |
2550 ARCSTAT_BUMP(arcstat_misses); |
2566 ARCSTAT_BUMP(arcstat_misses); |
2551 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), |
2567 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), |
2552 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, |
2568 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, |
2553 data, metadata, misses); |
2569 data, metadata, misses); |
2554 |
2570 |
2555 if (l2arc_ndev != 0) { |
2571 if (l2arc_ndev != 0) { |
2556 /* |
2572 /* |
|
2573 * Lock out device removal. |
|
2574 */ |
|
2575 spa_config_enter(spa, RW_READER, FTAG); |
|
2576 |
|
2577 /* |
2557 * Read from the L2ARC if the following are true: |
2578 * Read from the L2ARC if the following are true: |
2558 * 1. This buffer has L2ARC metadata. |
2579 * 1. The L2ARC vdev was previously cached. |
2559 * 2. This buffer isn't currently writing to the L2ARC. |
2580 * 2. This buffer still has L2ARC metadata. |
|
2581 * 3. This buffer isn't currently writing to the L2ARC. |
|
2582 * 4. The L2ARC entry wasn't evicted, which may |
|
2583 * also have invalidated the vdev. |
2560 */ |
2584 */ |
2561 if (hdr->b_l2hdr != NULL && !HDR_L2_WRITING(hdr)) { |
2585 if (vd != NULL && hdr->b_l2hdr != NULL && |
2562 vdev_t *vd = hdr->b_l2hdr->b_dev->l2ad_vdev; |
2586 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { |
2563 daddr_t addr = hdr->b_l2hdr->b_daddr; |
|
2564 l2arc_read_callback_t *cb; |
2587 l2arc_read_callback_t *cb; |
2565 |
2588 |
2566 if (vdev_is_dead(vd)) |
2589 if (vdev_is_dead(vd)) |
2567 goto skip_l2arc; |
2590 goto l2skip; |
2568 |
|
2569 hdr->b_flags |= ARC_L2_READING; |
|
2570 mutex_exit(hash_lock); |
|
2571 |
2591 |
2572 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); |
2592 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); |
2573 ARCSTAT_BUMP(arcstat_l2_hits); |
2593 ARCSTAT_BUMP(arcstat_l2_hits); |
2574 |
2594 |
2575 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), |
2595 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), |
2583 /* |
2603 /* |
2584 * l2arc read. |
2604 * l2arc read. |
2585 */ |
2605 */ |
2586 rzio = zio_read_phys(pio, vd, addr, size, |
2606 rzio = zio_read_phys(pio, vd, addr, size, |
2587 buf->b_data, ZIO_CHECKSUM_OFF, |
2607 buf->b_data, ZIO_CHECKSUM_OFF, |
2588 l2arc_read_done, cb, priority, |
2608 l2arc_read_done, cb, priority, flags | |
2589 flags | ZIO_FLAG_DONT_CACHE, B_FALSE); |
2609 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL, |
|
2610 B_FALSE); |
2590 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, |
2611 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, |
2591 zio_t *, rzio); |
2612 zio_t *, rzio); |
2592 |
2613 spa_config_exit(spa, FTAG); |
2593 if (*arc_flags & ARC_WAIT) |
2614 |
2594 return (zio_wait(rzio)); |
2615 if (*arc_flags & ARC_NOWAIT) { |
2595 |
2616 zio_nowait(rzio); |
2596 ASSERT(*arc_flags & ARC_NOWAIT); |
2617 return (0); |
2597 zio_nowait(rzio); |
2618 } |
2598 return (0); |
2619 |
|
2620 ASSERT(*arc_flags & ARC_WAIT); |
|
2621 if (zio_wait(rzio) == 0) |
|
2622 return (0); |
|
2623 |
|
2624 /* l2arc read error; goto zio_read() */ |
2599 } else { |
2625 } else { |
2600 DTRACE_PROBE1(l2arc__miss, |
2626 DTRACE_PROBE1(l2arc__miss, |
2601 arc_buf_hdr_t *, hdr); |
2627 arc_buf_hdr_t *, hdr); |
2602 ARCSTAT_BUMP(arcstat_l2_misses); |
2628 ARCSTAT_BUMP(arcstat_l2_misses); |
2603 if (HDR_L2_WRITING(hdr)) |
2629 if (HDR_L2_WRITING(hdr)) |
2604 ARCSTAT_BUMP(arcstat_l2_rw_clash); |
2630 ARCSTAT_BUMP(arcstat_l2_rw_clash); |
|
2631 l2skip: |
|
2632 spa_config_exit(spa, FTAG); |
2605 } |
2633 } |
2606 } |
2634 } |
2607 |
|
2608 skip_l2arc: |
|
2609 mutex_exit(hash_lock); |
|
2610 |
2635 |
2611 rzio = zio_read(pio, spa, bp, buf->b_data, size, |
2636 rzio = zio_read(pio, spa, bp, buf->b_data, size, |
2612 arc_read_done, buf, priority, flags, zb); |
2637 arc_read_done, buf, priority, flags, zb); |
2613 |
2638 |
2614 if (*arc_flags & ARC_WAIT) |
2639 if (*arc_flags & ARC_WAIT) |
3464 * pressure valve to prevent heavy read workloads from both stalling the ARC |
3490 * pressure valve to prevent heavy read workloads from both stalling the ARC |
3465 * with waits and clogging the L2ARC with writes. This also helps prevent |
3491 * with waits and clogging the L2ARC with writes. This also helps prevent |
3466 * the potential for the L2ARC to churn if it attempts to cache content too |
3492 * the potential for the L2ARC to churn if it attempts to cache content too |
3467 * quickly, such as during backups of the entire pool. |
3493 * quickly, such as during backups of the entire pool. |
3468 * |
3494 * |
3469 * 5. Writes to the L2ARC devices are grouped and sent in-sequence, so that |
3495 * 5. After system boot and before the ARC has filled main memory, there are |
|
3496 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru |
|
3497 * lists can remain mostly static. Instead of searching from tail of these |
|
3498 * lists as pictured, the l2arc_feed_thread() will search from the list heads |
|
3499 * for eligible buffers, greatly increasing its chance of finding them. |
|
3500 * |
|
3501 * The L2ARC device write speed is also boosted during this time so that |
|
3502 * the L2ARC warms up faster. Since there have been no ARC evictions yet, |
|
3503 * there are no L2ARC reads, and no fear of degrading read performance |
|
3504 * through increased writes. |
|
3505 * |
|
3506 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that |
3470 * the vdev queue can aggregate them into larger and fewer writes. Each |
3507 * the vdev queue can aggregate them into larger and fewer writes. Each |
3471 * device is written to in a rotor fashion, sweeping writes through |
3508 * device is written to in a rotor fashion, sweeping writes through |
3472 * available space then repeating. |
3509 * available space then repeating. |
3473 * |
3510 * |
3474 * 6. The L2ARC does not store dirty content. It never needs to flush |
3511 * 7. The L2ARC does not store dirty content. It never needs to flush |
3475 * write buffers back to disk based storage. |
3512 * write buffers back to disk based storage. |
3476 * |
3513 * |
3477 * 7. If an ARC buffer is written (and dirtied) which also exists in the |
3514 * 8. If an ARC buffer is written (and dirtied) which also exists in the |
3478 * L2ARC, the now stale L2ARC buffer is immediately dropped. |
3515 * L2ARC, the now stale L2ARC buffer is immediately dropped. |
3479 * |
3516 * |
3480 * The performance of the L2ARC can be tweaked by a number of tunables, which |
3517 * The performance of the L2ARC can be tweaked by a number of tunables, which |
3481 * may be necessary for different workloads: |
3518 * may be necessary for different workloads: |
3482 * |
3519 * |
3483 * l2arc_write_max max write bytes per interval |
3520 * l2arc_write_max max write bytes per interval |
|
3521 * l2arc_write_boost extra write bytes during device warmup |
3484 * l2arc_noprefetch skip caching prefetched buffers |
3522 * l2arc_noprefetch skip caching prefetched buffers |
3485 * l2arc_headroom number of max device writes to precache |
3523 * l2arc_headroom number of max device writes to precache |
3486 * l2arc_feed_secs seconds between L2ARC writing |
3524 * l2arc_feed_secs seconds between L2ARC writing |
3487 * |
3525 * |
3488 * Tunables may be removed or added as future performance improvements are |
3526 * Tunables may be removed or added as future performance improvements are |
3503 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); |
3541 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); |
3504 } |
3542 } |
3505 |
3543 |
3506 /* |
3544 /* |
3507 * Cycle through L2ARC devices. This is how L2ARC load balances. |
3545 * Cycle through L2ARC devices. This is how L2ARC load balances. |
3508 * This is called with l2arc_dev_mtx held, which also locks out spa removal. |
3546 * If a device is returned, this also returns holding the spa config lock. |
3509 */ |
3547 */ |
3510 static l2arc_dev_t * |
3548 static l2arc_dev_t * |
3511 l2arc_dev_get_next(void) |
3549 l2arc_dev_get_next(void) |
3512 { |
3550 { |
3513 l2arc_dev_t *next, *first; |
3551 l2arc_dev_t *first, *next = NULL; |
|
3552 |
|
3553 /* |
|
3554 * Lock out the removal of spas (spa_namespace_lock), then removal |
|
3555 * of cache devices (l2arc_dev_mtx). Once a device has been selected, |
|
3556 * both locks will be dropped and a spa config lock held instead. |
|
3557 */ |
|
3558 mutex_enter(&spa_namespace_lock); |
|
3559 mutex_enter(&l2arc_dev_mtx); |
3514 |
3560 |
3515 /* if there are no vdevs, there is nothing to do */ |
3561 /* if there are no vdevs, there is nothing to do */ |
3516 if (l2arc_ndev == 0) |
3562 if (l2arc_ndev == 0) |
3517 return (NULL); |
3563 goto out; |
3518 |
3564 |
3519 first = NULL; |
3565 first = NULL; |
3520 next = l2arc_dev_last; |
3566 next = l2arc_dev_last; |
3521 do { |
3567 do { |
3522 /* loop around the list looking for a non-faulted vdev */ |
3568 /* loop around the list looking for a non-faulted vdev */ |
3536 |
3582 |
3537 } while (vdev_is_dead(next->l2ad_vdev)); |
3583 } while (vdev_is_dead(next->l2ad_vdev)); |
3538 |
3584 |
3539 /* if we were unable to find any usable vdevs, return NULL */ |
3585 /* if we were unable to find any usable vdevs, return NULL */ |
3540 if (vdev_is_dead(next->l2ad_vdev)) |
3586 if (vdev_is_dead(next->l2ad_vdev)) |
3541 return (NULL); |
3587 next = NULL; |
3542 |
3588 |
3543 l2arc_dev_last = next; |
3589 l2arc_dev_last = next; |
3544 |
3590 |
|
3591 out: |
|
3592 mutex_exit(&l2arc_dev_mtx); |
|
3593 |
|
3594 /* |
|
3595 * Grab the config lock to prevent the 'next' device from being |
|
3596 * removed while we are writing to it. |
|
3597 */ |
|
3598 if (next != NULL) |
|
3599 spa_config_enter(next->l2ad_spa, RW_READER, next); |
|
3600 mutex_exit(&spa_namespace_lock); |
|
3601 |
3545 return (next); |
3602 return (next); |
|
3603 } |
|
3604 |
|
3605 /* |
|
3606 * Free buffers that were tagged for destruction. |
|
3607 */ |
|
3608 static void |
|
3609 l2arc_do_free_on_write() |
|
3610 { |
|
3611 list_t *buflist; |
|
3612 l2arc_data_free_t *df, *df_prev; |
|
3613 |
|
3614 mutex_enter(&l2arc_free_on_write_mtx); |
|
3615 buflist = l2arc_free_on_write; |
|
3616 |
|
3617 for (df = list_tail(buflist); df; df = df_prev) { |
|
3618 df_prev = list_prev(buflist, df); |
|
3619 ASSERT(df->l2df_data != NULL); |
|
3620 ASSERT(df->l2df_func != NULL); |
|
3621 df->l2df_func(df->l2df_data, df->l2df_size); |
|
3622 list_remove(buflist, df); |
|
3623 kmem_free(df, sizeof (l2arc_data_free_t)); |
|
3624 } |
|
3625 |
|
3626 mutex_exit(&l2arc_free_on_write_mtx); |
3546 } |
3627 } |
3547 |
3628 |
3548 /* |
3629 /* |
3549 * A write to a cache device has completed. Update all headers to allow |
3630 * A write to a cache device has completed. Update all headers to allow |
3550 * reads from these buffers to begin. |
3631 * reads from these buffers to begin. |
3610 atomic_inc_64(&l2arc_writes_done); |
3695 atomic_inc_64(&l2arc_writes_done); |
3611 list_remove(buflist, head); |
3696 list_remove(buflist, head); |
3612 kmem_cache_free(hdr_cache, head); |
3697 kmem_cache_free(hdr_cache, head); |
3613 mutex_exit(&l2arc_buflist_mtx); |
3698 mutex_exit(&l2arc_buflist_mtx); |
3614 |
3699 |
3615 /* |
3700 l2arc_do_free_on_write(); |
3616 * Free buffers that were tagged for destruction. |
|
3617 */ |
|
3618 mutex_enter(&l2arc_free_on_write_mtx); |
|
3619 buflist = l2arc_free_on_write; |
|
3620 for (df = list_tail(buflist); df; df = df_prev) { |
|
3621 df_prev = list_prev(buflist, df); |
|
3622 ASSERT(df->l2df_data != NULL); |
|
3623 ASSERT(df->l2df_func != NULL); |
|
3624 df->l2df_func(df->l2df_data, df->l2df_size); |
|
3625 list_remove(buflist, df); |
|
3626 kmem_free(df, sizeof (l2arc_data_free_t)); |
|
3627 } |
|
3628 mutex_exit(&l2arc_free_on_write_mtx); |
|
3629 |
3701 |
3630 kmem_free(cb, sizeof (l2arc_write_callback_t)); |
3702 kmem_free(cb, sizeof (l2arc_write_callback_t)); |
3631 } |
3703 } |
3632 |
3704 |
3633 /* |
3705 /* |
3666 mutex_exit(hash_lock); |
3738 mutex_exit(hash_lock); |
3667 /* |
3739 /* |
3668 * Buffer didn't survive caching. Increment stats and |
3740 * Buffer didn't survive caching. Increment stats and |
3669 * reissue to the original storage device. |
3741 * reissue to the original storage device. |
3670 */ |
3742 */ |
3671 if (zio->io_error != 0) |
3743 if (zio->io_error != 0) { |
3672 ARCSTAT_BUMP(arcstat_l2_io_error); |
3744 ARCSTAT_BUMP(arcstat_l2_io_error); |
|
3745 } else { |
|
3746 zio->io_error = EIO; |
|
3747 } |
3673 if (!equal) |
3748 if (!equal) |
3674 ARCSTAT_BUMP(arcstat_l2_cksum_bad); |
3749 ARCSTAT_BUMP(arcstat_l2_cksum_bad); |
3675 |
3750 |
3676 zio->io_flags &= ~ZIO_FLAG_DONT_CACHE; |
3751 if (zio->io_waiter == NULL) { |
3677 rzio = zio_read(NULL, cb->l2rcb_spa, &cb->l2rcb_bp, |
3752 /* |
3678 buf->b_data, zio->io_size, arc_read_done, buf, |
3753 * Let the resent I/O call arc_read_done() instead. |
3679 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); |
3754 */ |
3680 |
3755 zio->io_done = NULL; |
3681 /* |
3756 zio->io_flags &= ~ZIO_FLAG_DONT_CACHE; |
3682 * Since this is a separate thread, we can wait on this |
3757 |
3683 * I/O whether there is an io_waiter or not. |
3758 rzio = zio_read(NULL, cb->l2rcb_spa, &cb->l2rcb_bp, |
3684 */ |
3759 buf->b_data, zio->io_size, arc_read_done, buf, |
3685 err = zio_wait(rzio); |
3760 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); |
3686 |
3761 |
3687 /* |
3762 (void) zio_nowait(rzio); |
3688 * Let the resent I/O call arc_read_done() instead. |
3763 } |
3689 * io_error is set to the reissued I/O error status. |
|
3690 */ |
|
3691 zio->io_done = NULL; |
|
3692 zio->io_waiter = NULL; |
|
3693 zio->io_error = err; |
|
3694 } |
3764 } |
3695 |
3765 |
3696 kmem_free(cb, sizeof (l2arc_read_callback_t)); |
3766 kmem_free(cb, sizeof (l2arc_read_callback_t)); |
3697 } |
3767 } |
3698 |
3768 |
3875 * |
3943 * |
3876 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid |
3944 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid |
3877 * for reading until they have completed writing. |
3945 * for reading until they have completed writing. |
3878 */ |
3946 */ |
3879 static void |
3947 static void |
3880 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev) |
3948 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) |
3881 { |
3949 { |
3882 arc_buf_hdr_t *ab, *ab_prev, *head; |
3950 arc_buf_hdr_t *ab, *ab_prev, *head; |
3883 l2arc_buf_hdr_t *hdrl2; |
3951 l2arc_buf_hdr_t *hdrl2; |
3884 list_t *list; |
3952 list_t *list; |
3885 uint64_t passed_sz, write_sz, buf_sz; |
3953 uint64_t passed_sz, write_sz, buf_sz, headroom; |
3886 uint64_t target_sz = dev->l2ad_write; |
|
3887 uint64_t headroom = dev->l2ad_write * l2arc_headroom; |
|
3888 void *buf_data; |
3954 void *buf_data; |
3889 kmutex_t *hash_lock, *list_lock; |
3955 kmutex_t *hash_lock, *list_lock; |
3890 boolean_t have_lock, full; |
3956 boolean_t have_lock, full; |
3891 l2arc_write_callback_t *cb; |
3957 l2arc_write_callback_t *cb; |
3892 zio_t *pio, *wzio; |
3958 zio_t *pio, *wzio; |
3893 |
3959 |
3894 ASSERT(MUTEX_HELD(&l2arc_dev_mtx)); |
|
3895 ASSERT(dev->l2ad_vdev != NULL); |
3960 ASSERT(dev->l2ad_vdev != NULL); |
3896 |
3961 |
3897 pio = NULL; |
3962 pio = NULL; |
3898 write_sz = 0; |
3963 write_sz = 0; |
3899 full = B_FALSE; |
3964 full = B_FALSE; |
3906 mutex_enter(&l2arc_buflist_mtx); |
3971 mutex_enter(&l2arc_buflist_mtx); |
3907 for (int try = 0; try <= 3; try++) { |
3972 for (int try = 0; try <= 3; try++) { |
3908 list = l2arc_list_locked(try, &list_lock); |
3973 list = l2arc_list_locked(try, &list_lock); |
3909 passed_sz = 0; |
3974 passed_sz = 0; |
3910 |
3975 |
3911 for (ab = list_tail(list); ab; ab = ab_prev) { |
3976 /* |
3912 ab_prev = list_prev(list, ab); |
3977 * L2ARC fast warmup. |
|
3978 * |
|
3979 * Until the ARC is warm and starts to evict, read from the |
|
3980 * head of the ARC lists rather than the tail. |
|
3981 */ |
|
3982 headroom = target_sz * l2arc_headroom; |
|
3983 if (arc_warm == B_FALSE) |
|
3984 ab = list_head(list); |
|
3985 else |
|
3986 ab = list_tail(list); |
|
3987 |
|
3988 for (; ab; ab = ab_prev) { |
|
3989 if (arc_warm == B_FALSE) |
|
3990 ab_prev = list_next(list, ab); |
|
3991 else |
|
3992 ab_prev = list_prev(list, ab); |
3913 |
3993 |
3914 hash_lock = HDR_LOCK(ab); |
3994 hash_lock = HDR_LOCK(ab); |
3915 have_lock = MUTEX_HELD(hash_lock); |
3995 have_lock = MUTEX_HELD(hash_lock); |
3916 if (!have_lock && !mutex_tryenter(hash_lock)) { |
3996 if (!have_lock && !mutex_tryenter(hash_lock)) { |
3917 /* |
3997 /* |
4030 |
4110 |
4031 /* |
4111 /* |
4032 * Bump device hand to the device start if it is approaching the end. |
4112 * Bump device hand to the device start if it is approaching the end. |
4033 * l2arc_evict() will already have evicted ahead for this case. |
4113 * l2arc_evict() will already have evicted ahead for this case. |
4034 */ |
4114 */ |
4035 if (dev->l2ad_hand >= (dev->l2ad_end - dev->l2ad_write)) { |
4115 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { |
4036 spa_l2cache_space_update(dev->l2ad_vdev, 0, |
4116 spa_l2cache_space_update(dev->l2ad_vdev, 0, |
4037 dev->l2ad_end - dev->l2ad_hand); |
4117 dev->l2ad_end - dev->l2ad_hand); |
4038 dev->l2ad_hand = dev->l2ad_start; |
4118 dev->l2ad_hand = dev->l2ad_start; |
4039 dev->l2ad_evict = dev->l2ad_start; |
4119 dev->l2ad_evict = dev->l2ad_start; |
4040 dev->l2ad_first = B_FALSE; |
4120 dev->l2ad_first = B_FALSE; |
4051 l2arc_feed_thread(void) |
4131 l2arc_feed_thread(void) |
4052 { |
4132 { |
4053 callb_cpr_t cpr; |
4133 callb_cpr_t cpr; |
4054 l2arc_dev_t *dev; |
4134 l2arc_dev_t *dev; |
4055 spa_t *spa; |
4135 spa_t *spa; |
4056 int interval; |
4136 uint64_t size; |
4057 boolean_t startup = B_TRUE; |
|
4058 |
4137 |
4059 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); |
4138 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); |
4060 |
4139 |
4061 mutex_enter(&l2arc_feed_thr_lock); |
4140 mutex_enter(&l2arc_feed_thr_lock); |
4062 |
4141 |
4063 while (l2arc_thread_exit == 0) { |
4142 while (l2arc_thread_exit == 0) { |
4064 /* |
4143 /* |
4065 * Initially pause for L2ARC_FEED_DELAY seconds as a grace |
4144 * Pause for l2arc_feed_secs seconds between writes. |
4066 * interval during boot, followed by l2arc_feed_secs seconds |
|
4067 * thereafter. |
|
4068 */ |
4145 */ |
4069 CALLB_CPR_SAFE_BEGIN(&cpr); |
4146 CALLB_CPR_SAFE_BEGIN(&cpr); |
4070 if (startup) { |
|
4071 interval = L2ARC_FEED_DELAY; |
|
4072 startup = B_FALSE; |
|
4073 } else { |
|
4074 interval = l2arc_feed_secs; |
|
4075 } |
|
4076 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, |
4147 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, |
4077 lbolt + (hz * interval)); |
4148 lbolt + (hz * l2arc_feed_secs)); |
4078 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); |
4149 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); |
4079 |
4150 |
|
4151 /* |
|
4152 * Quick check for L2ARC devices. |
|
4153 */ |
4080 mutex_enter(&l2arc_dev_mtx); |
4154 mutex_enter(&l2arc_dev_mtx); |
|
4155 if (l2arc_ndev == 0) { |
|
4156 mutex_exit(&l2arc_dev_mtx); |
|
4157 continue; |
|
4158 } |
|
4159 mutex_exit(&l2arc_dev_mtx); |
4081 |
4160 |
4082 /* |
4161 /* |
4083 * This selects the next l2arc device to write to, and in |
4162 * This selects the next l2arc device to write to, and in |
4084 * doing so the next spa to feed from: dev->l2ad_spa. This |
4163 * doing so the next spa to feed from: dev->l2ad_spa. This |
4085 * will return NULL if there are no l2arc devices or if they |
4164 * will return NULL if there are now no l2arc devices or if |
4086 * are all faulted. |
4165 * they are all faulted. |
|
4166 * |
|
4167 * If a device is returned, its spa's config lock is also |
|
4168 * held to prevent device removal. l2arc_dev_get_next() |
|
4169 * will grab and release l2arc_dev_mtx. |
4087 */ |
4170 */ |
4088 if ((dev = l2arc_dev_get_next()) == NULL) { |
4171 if ((dev = l2arc_dev_get_next()) == NULL) |
4089 mutex_exit(&l2arc_dev_mtx); |
|
4090 continue; |
4172 continue; |
4091 } |
4173 |
|
4174 spa = dev->l2ad_spa; |
|
4175 ASSERT(spa != NULL); |
4092 |
4176 |
4093 /* |
4177 /* |
4094 * Avoid contributing to memory pressure. |
4178 * Avoid contributing to memory pressure. |
4095 */ |
4179 */ |
4096 if (arc_reclaim_needed()) { |
4180 if (arc_reclaim_needed()) { |
4097 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); |
4181 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); |
4098 mutex_exit(&l2arc_dev_mtx); |
4182 spa_config_exit(spa, dev); |
4099 continue; |
4183 continue; |
4100 } |
4184 } |
4101 |
4185 |
4102 spa = dev->l2ad_spa; |
|
4103 ASSERT(spa != NULL); |
|
4104 ARCSTAT_BUMP(arcstat_l2_feeds); |
4186 ARCSTAT_BUMP(arcstat_l2_feeds); |
|
4187 |
|
4188 size = dev->l2ad_write; |
|
4189 if (arc_warm == B_FALSE) |
|
4190 size += dev->l2ad_boost; |
4105 |
4191 |
4106 /* |
4192 /* |
4107 * Evict L2ARC buffers that will be overwritten. |
4193 * Evict L2ARC buffers that will be overwritten. |
4108 */ |
4194 */ |
4109 l2arc_evict(dev, dev->l2ad_write, B_FALSE); |
4195 l2arc_evict(dev, size, B_FALSE); |
4110 |
4196 |
4111 /* |
4197 /* |
4112 * Write ARC buffers. |
4198 * Write ARC buffers. |
4113 */ |
4199 */ |
4114 l2arc_write_buffers(spa, dev); |
4200 l2arc_write_buffers(spa, dev, size); |
4115 mutex_exit(&l2arc_dev_mtx); |
4201 spa_config_exit(spa, dev); |
4116 } |
4202 } |
4117 |
4203 |
4118 l2arc_thread_exit = 0; |
4204 l2arc_thread_exit = 0; |
4119 cv_broadcast(&l2arc_feed_thr_cv); |
4205 cv_broadcast(&l2arc_feed_thr_cv); |
4120 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ |
4206 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ |
4153 */ |
4239 */ |
4154 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); |
4240 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); |
4155 adddev->l2ad_spa = spa; |
4241 adddev->l2ad_spa = spa; |
4156 adddev->l2ad_vdev = vd; |
4242 adddev->l2ad_vdev = vd; |
4157 adddev->l2ad_write = l2arc_write_max; |
4243 adddev->l2ad_write = l2arc_write_max; |
|
4244 adddev->l2ad_boost = l2arc_write_boost; |
4158 adddev->l2ad_start = start; |
4245 adddev->l2ad_start = start; |
4159 adddev->l2ad_end = end; |
4246 adddev->l2ad_end = end; |
4160 adddev->l2ad_hand = adddev->l2ad_start; |
4247 adddev->l2ad_hand = adddev->l2ad_start; |
4161 adddev->l2ad_evict = adddev->l2ad_start; |
4248 adddev->l2ad_evict = adddev->l2ad_start; |
4162 adddev->l2ad_first = B_TRUE; |
4249 adddev->l2ad_first = B_TRUE; |
4211 /* |
4292 /* |
4212 * Remove device from global list |
4293 * Remove device from global list |
4213 */ |
4294 */ |
4214 list_remove(l2arc_dev_list, remdev); |
4295 list_remove(l2arc_dev_list, remdev); |
4215 l2arc_dev_last = NULL; /* may have been invalidated */ |
4296 l2arc_dev_last = NULL; /* may have been invalidated */ |
|
4297 atomic_dec_64(&l2arc_ndev); |
|
4298 mutex_exit(&l2arc_dev_mtx); |
4216 |
4299 |
4217 /* |
4300 /* |
4218 * Clear all buflists and ARC references. L2ARC device flush. |
4301 * Clear all buflists and ARC references. L2ARC device flush. |
4219 */ |
4302 */ |
4220 l2arc_evict(remdev, 0, B_TRUE); |
4303 l2arc_evict(remdev, 0, B_TRUE); |
4221 list_destroy(remdev->l2ad_buflist); |
4304 list_destroy(remdev->l2ad_buflist); |
4222 kmem_free(remdev->l2ad_buflist, sizeof (list_t)); |
4305 kmem_free(remdev->l2ad_buflist, sizeof (list_t)); |
4223 kmem_free(remdev, sizeof (l2arc_dev_t)); |
4306 kmem_free(remdev, sizeof (l2arc_dev_t)); |
4224 |
|
4225 atomic_dec_64(&l2arc_ndev); |
|
4226 mutex_exit(&l2arc_dev_mtx); |
|
4227 } |
4307 } |
4228 |
4308 |
4229 void |
4309 void |
4230 l2arc_init() |
4310 l2arc_init() |
4231 { |
4311 { |
4252 } |
4332 } |
4253 |
4333 |
4254 void |
4334 void |
4255 l2arc_fini() |
4335 l2arc_fini() |
4256 { |
4336 { |
|
4337 /* |
|
4338 * This is called from dmu_fini(), which is called from spa_fini(). |
|
4339 * Because of this, we can assume that all l2arc devices have |
|
4340 * already been removed when the pools themselves were removed. |
|
4341 */ |
|
4342 |
4257 mutex_enter(&l2arc_feed_thr_lock); |
4343 mutex_enter(&l2arc_feed_thr_lock); |
4258 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ |
4344 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ |
4259 l2arc_thread_exit = 1; |
4345 l2arc_thread_exit = 1; |
4260 while (l2arc_thread_exit != 0) |
4346 while (l2arc_thread_exit != 0) |
4261 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); |
4347 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); |
4262 mutex_exit(&l2arc_feed_thr_lock); |
4348 mutex_exit(&l2arc_feed_thr_lock); |
4263 |
4349 |
|
4350 l2arc_do_free_on_write(); |
|
4351 |
4264 mutex_destroy(&l2arc_feed_thr_lock); |
4352 mutex_destroy(&l2arc_feed_thr_lock); |
4265 cv_destroy(&l2arc_feed_thr_cv); |
4353 cv_destroy(&l2arc_feed_thr_cv); |
4266 mutex_destroy(&l2arc_dev_mtx); |
4354 mutex_destroy(&l2arc_dev_mtx); |
4267 mutex_destroy(&l2arc_buflist_mtx); |
4355 mutex_destroy(&l2arc_buflist_mtx); |
4268 mutex_destroy(&l2arc_free_on_write_mtx); |
4356 mutex_destroy(&l2arc_free_on_write_mtx); |