usr/src/uts/common/fs/zfs/arc.c
changeset 6987 877c018eb06c
parent 6643 3a34b0dbb107
child 7046 361307ae060d
comparison of 6986:1e7638a44ce6 with 6987:877c018eb06c
@@ -159,10 +159,15 @@
  * (initialized in arc_init())
  */
 static int		arc_min_prefetch_lifespan;
 
 static int arc_dead;
+
+/*
+ * The arc has filled available memory and has now warmed up.
+ */
+static boolean_t arc_warm;
 
 /*
  * These tunables are for performance analysis.
  */
 uint64_t zfs_arc_max;
@@ -464,23 +469,23 @@
 #define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
 #define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
 #define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
 #define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
 #define	ARC_DONT_L2CACHE	(1 << 16)	/* originated by prefetch */
-#define	ARC_L2_READING		(1 << 17)	/* L2ARC read in progress */
-#define	ARC_L2_WRITING		(1 << 18)	/* L2ARC write in progress */
-#define	ARC_L2_EVICTED		(1 << 19)	/* evicted during I/O */
-#define	ARC_L2_WRITE_HEAD	(1 << 20)	/* head of write list */
+#define	ARC_L2_WRITING		(1 << 17)	/* L2ARC write in progress */
+#define	ARC_L2_EVICTED		(1 << 18)	/* evicted during I/O */
+#define	ARC_L2_WRITE_HEAD	(1 << 19)	/* head of write list */
 
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
 #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
 #define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
 #define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 #define	HDR_DONT_L2CACHE(hdr)	((hdr)->b_flags & ARC_DONT_L2CACHE)
-#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_L2_READING)
+#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
+				    (hdr)->b_l2hdr != NULL)
 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 
 /*
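
The hunk above retires the dedicated ARC_L2_READING flag bit: whether a header is being read from the L2ARC is now derived from ARC_IO_IN_PROGRESS plus the presence of L2ARC metadata, so this state can no longer go stale on its own, and the remaining L2 flags shift down one bit. A minimal user-space sketch of the derived test (the types are stand-ins, and the ARC_IO_IN_PROGRESS bit value is an assumption since its definition lies outside this hunk):

    #include <stdio.h>

    #define ARC_IO_IN_PROGRESS	(1 << 10)	/* assumed bit position */
    #define ARC_L2_WRITING	(1 << 17)
    #define ARC_L2_EVICTED	(1 << 18)

    typedef struct l2arc_buf_hdr { int unused; } l2arc_buf_hdr_t;

    typedef struct arc_buf_hdr {
    	int b_flags;
    	l2arc_buf_hdr_t *b_l2hdr;
    } arc_buf_hdr_t;

    /* "reading from L2ARC" is derived: an I/O is in flight AND the header
     * still carries L2ARC metadata -- no separate flag bit to keep in sync */
    #define HDR_L2_READING(hdr)	(((hdr)->b_flags & ARC_IO_IN_PROGRESS) && \
    	(hdr)->b_l2hdr != NULL)

    int
    main(void)
    {
    	l2arc_buf_hdr_t l2 = { 0 };
    	arc_buf_hdr_t hdr = { ARC_IO_IN_PROGRESS, &l2 };

    	printf("reading: %d\n", HDR_L2_READING(&hdr) != 0);	/* 1 */
    	hdr.b_l2hdr = NULL;	/* metadata dropped: no longer an L2 read */
    	printf("reading: %d\n", HDR_L2_READING(&hdr) != 0);	/* 0 */
    	return (0);
    }
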
@@ -525,20 +530,20 @@
  * Level 2 ARC
  */
 
 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
 #define	L2ARC_HEADROOM		4		/* num of writes */
-#define	L2ARC_FEED_DELAY	180		/* starting grace */
 #define	L2ARC_FEED_SECS		1		/* caching interval */
 
 #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
 #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
 
 /*
  * L2ARC Performance Tunables
  */
 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
+uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
 boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
 
 /*
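
l2arc_write_boost is the new knob here: while the ARC is still warming up there are no L2ARC reads to disturb, so each feed interval may write l2arc_write_max plus the boost. A stand-alone sketch of the sizing arithmetic as the feed thread applies it later in this changeset (plain user-space C, not the kernel code):

    #include <stdio.h>
    #include <stdint.h>

    #define L2ARC_WRITE_SIZE	(8 * 1024 * 1024)

    uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;
    uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;

    int
    main(void)
    {
    	int arc_warm = 0;	/* ARC has not yet filled memory */
    	uint64_t size = l2arc_write_max;

    	if (!arc_warm)
    		size += l2arc_write_boost;	/* 16 MB per pass while cold */

    	printf("write target: %llu bytes\n", (unsigned long long)size);
    	return (0);
    }
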
@@ -547,10 +552,11 @@
 typedef struct l2arc_dev {
 	vdev_t			*l2ad_vdev;	/* vdev */
 	spa_t			*l2ad_spa;	/* spa */
 	uint64_t		l2ad_hand;	/* next write location */
 	uint64_t		l2ad_write;	/* desired write size, bytes */
+	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
 	uint64_t		l2ad_start;	/* first addr on device */
 	uint64_t		l2ad_end;	/* last addr on device */
 	uint64_t		l2ad_evict;	/* last addr eviction reached */
 	boolean_t		l2ad_first;	/* first sweep through */
 	list_t			*l2ad_buflist;	/* buffer list */
@@ -1883,10 +1889,11 @@
 
 			/* reset the growth delay for every reclaim */
 			growtime = lbolt + (arc_grow_retry * hz);
 
 			arc_kmem_reap_now(last_reclaim);
+			arc_warm = B_TRUE;
 
 		} else if (arc_no_grow && lbolt >= growtime) {
 			arc_no_grow = FALSE;
 		}
 
@@ -2280,11 +2287,11 @@
 
 	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
 	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
 	    (found == hdr && HDR_L2_READING(hdr)));
 
-	hdr->b_flags &= ~(ARC_L2_READING|ARC_L2_EVICTED);
+	hdr->b_flags &= ~ARC_L2_EVICTED;
 	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
 		hdr->b_flags |= ARC_DONT_L2CACHE;
 
 	/* byteswap if necessary */
 	callback_list = hdr->b_acb;
@@ -2469,10 +2476,12 @@
 		if (done)
 			done(NULL, buf, private);
 	} else {
 		uint64_t size = BP_GET_LSIZE(bp);
 		arc_callback_t	*acb;
+		vdev_t *vd = NULL;
+		daddr_t addr;
 
 		if (hdr == NULL) {
 			/* this block is not in the cache */
 			arc_buf_hdr_t	*exists;
 			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
@@ -2542,34 +2551,45 @@
 		 */
 
 		if (GHOST_STATE(hdr->b_state))
 			arc_access(hdr, hash_lock);
 
+		if (hdr->b_l2hdr != NULL) {
+			vd = hdr->b_l2hdr->b_dev->l2ad_vdev;
+			addr = hdr->b_l2hdr->b_daddr;
+		}
+
+		mutex_exit(hash_lock);
+
 		ASSERT3U(hdr->b_size, ==, size);
 		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
 		    zbookmark_t *, zb);
 		ARCSTAT_BUMP(arcstat_misses);
 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 		    data, metadata, misses);
 
 		if (l2arc_ndev != 0) {
 			/*
+			 * Lock out device removal.
+			 */
+			spa_config_enter(spa, RW_READER, FTAG);
+
+			/*
 			 * Read from the L2ARC if the following are true:
-			 * 1. This buffer has L2ARC metadata.
-			 * 2. This buffer isn't currently writing to the L2ARC.
+			 * 1. The L2ARC vdev was previously cached.
+			 * 2. This buffer still has L2ARC metadata.
+			 * 3. This buffer isn't currently writing to the L2ARC.
+			 * 4. The L2ARC entry wasn't evicted, which may
+			 *    also have invalidated the vdev.
 			 */
-			if (hdr->b_l2hdr != NULL && !HDR_L2_WRITING(hdr)) {
-				vdev_t *vd = hdr->b_l2hdr->b_dev->l2ad_vdev;
-				daddr_t addr = hdr->b_l2hdr->b_daddr;
+			if (vd != NULL && hdr->b_l2hdr != NULL &&
+			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
 				l2arc_read_callback_t *cb;
 
 				if (vdev_is_dead(vd))
-					goto skip_l2arc;
-
-				hdr->b_flags |= ARC_L2_READING;
-				mutex_exit(hash_lock);
+					goto l2skip;
 
 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_hits);
 
 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
@@ -2583,32 +2603,37 @@
 				/*
 				 * l2arc read.
 				 */
 				rzio = zio_read_phys(pio, vd, addr, size,
 				    buf->b_data, ZIO_CHECKSUM_OFF,
-				    l2arc_read_done, cb, priority,
-				    flags | ZIO_FLAG_DONT_CACHE, B_FALSE);
+				    l2arc_read_done, cb, priority, flags |
+				    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL,
+				    B_FALSE);
 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
 				    zio_t *, rzio);
-
-				if (*arc_flags & ARC_WAIT)
-					return (zio_wait(rzio));
-
-				ASSERT(*arc_flags & ARC_NOWAIT);
-				zio_nowait(rzio);
-				return (0);
+				spa_config_exit(spa, FTAG);
+
+				if (*arc_flags & ARC_NOWAIT) {
+					zio_nowait(rzio);
+					return (0);
+				}
+
+				ASSERT(*arc_flags & ARC_WAIT);
+				if (zio_wait(rzio) == 0)
+					return (0);
+
+				/* l2arc read error; goto zio_read() */
 			} else {
 				DTRACE_PROBE1(l2arc__miss,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_misses);
 				if (HDR_L2_WRITING(hdr))
 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
+l2skip:
+				spa_config_exit(spa, FTAG);
 			}
 		}
-
-skip_l2arc:
-		mutex_exit(hash_lock);
 
 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
 		    arc_read_done, buf, priority, flags, zb);
 
 		if (*arc_flags & ARC_WAIT)
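
With ZIO_FLAG_CANFAIL set, a failed L2ARC read no longer errors out the caller: the ARC_WAIT path can observe the zio_wait() result and fall through to the ordinary zio_read() against the primary pool. A user-space sketch of that control flow (all functions are hypothetical stubs, not ZFS calls):

    #include <stdio.h>

    #define ARC_WAIT	0x1
    #define ARC_NOWAIT	0x2

    static int
    l2_read(void)		/* stub: simulate a failed cache-device read */
    {
    	return (-1);
    }

    static int
    primary_read(void)	/* stub: the pool copy is authoritative */
    {
    	return (0);
    }

    static int
    arc_read_model(int arc_flags)
    {
    	if (arc_flags & ARC_NOWAIT) {
    		/* async: fire the L2 read; its done callback repairs errors */
    		(void) l2_read();
    		return (0);
    	}

    	/* sync: if the L2 copy fails, fall through to the primary read */
    	if (l2_read() == 0)
    		return (0);
    	return (primary_read());
    }

    int
    main(void)
    {
    	printf("sync read -> %d\n", arc_read_model(ARC_WAIT));
    	return (0);
    }
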
@@ -3324,10 +3349,11 @@
 
 	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
 	    TS_RUN, minclsyspri);
 
 	arc_dead = FALSE;
+	arc_warm = B_FALSE;
 
 	if (zfs_write_limit_max == 0)
 		zfs_write_limit_max = physmem * PAGESIZE >>
 		    zfs_write_limit_shift;
 	else
@@ -3464,25 +3490,37 @@
  * pressure valve to prevent heavy read workloads from both stalling the ARC
  * with waits and clogging the L2ARC with writes.  This also helps prevent
  * the potential for the L2ARC to churn if it attempts to cache content too
  * quickly, such as during backups of the entire pool.
  *
- * 5. Writes to the L2ARC devices are grouped and sent in-sequence, so that
+ * 5. After system boot and before the ARC has filled main memory, there are
+ * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
+ * lists can remain mostly static.  Instead of searching from tail of these
+ * lists as pictured, the l2arc_feed_thread() will search from the list heads
+ * for eligible buffers, greatly increasing its chance of finding them.
+ *
+ * The L2ARC device write speed is also boosted during this time so that
+ * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
+ * there are no L2ARC reads, and no fear of degrading read performance
+ * through increased writes.
+ *
+ * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
  * the vdev queue can aggregate them into larger and fewer writes.  Each
 * device is written to in a rotor fashion, sweeping writes through
 * available space then repeating.
 *
- * 6. The L2ARC does not store dirty content.  It never needs to flush
+ * 7. The L2ARC does not store dirty content.  It never needs to flush
 * write buffers back to disk based storage.
 *
- * 7. If an ARC buffer is written (and dirtied) which also exists in the
+ * 8. If an ARC buffer is written (and dirtied) which also exists in the
 * L2ARC, the now stale L2ARC buffer is immediately dropped.
 *
 * The performance of the L2ARC can be tweaked by a number of tunables, which
 * may be necessary for different workloads:
 *
 *	l2arc_write_max		max write bytes per interval
+ *	l2arc_write_boost	extra write bytes during device warmup
 *	l2arc_noprefetch	skip caching prefetched buffers
 *	l2arc_headroom		number of max device writes to precache
 *	l2arc_feed_secs		seconds between L2ARC writing
 *
 * Tunables may be removed or added as future performance improvements are
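
A stand-alone illustration of the scan direction described in item 5 above: before the ARC has warmed, eligible buffers cluster near the list heads, so the feed thread walks head-to-tail; afterwards it walks tail-to-head as pictured. A plain C doubly linked list stands in for the kernel list_t:

    #include <stdio.h>
    #include <stddef.h>

    typedef struct buf {
    	int id;
    	struct buf *next, *prev;
    } buf_t;

    static buf_t *
    scan_start(buf_t *head, buf_t *tail, int arc_warm)
    {
    	return (arc_warm ? tail : head);	/* cold: start at the head */
    }

    static buf_t *
    scan_step(buf_t *b, int arc_warm)
    {
    	return (arc_warm ? b->prev : b->next);
    }

    int
    main(void)
    {
    	buf_t a = { 1, NULL, NULL }, b = { 2, NULL, NULL };
    	buf_t *p;

    	a.next = &b;
    	b.prev = &a;

    	for (p = scan_start(&a, &b, 0); p != NULL; p = scan_step(p, 0))
    		printf("visit %d\n", p->id);	/* 1 then 2: head first */
    	return (0);
    }
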
@@ -3503,20 +3541,28 @@
 	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
 }
 
 /*
  * Cycle through L2ARC devices.  This is how L2ARC load balances.
- * This is called with l2arc_dev_mtx held, which also locks out spa removal.
+ * If a device is returned, this also returns holding the spa config lock.
  */
 static l2arc_dev_t *
 l2arc_dev_get_next(void)
 {
-	l2arc_dev_t *next, *first;
+	l2arc_dev_t *first, *next = NULL;
+
+	/*
+	 * Lock out the removal of spas (spa_namespace_lock), then removal
+	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
+	 * both locks will be dropped and a spa config lock held instead.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	mutex_enter(&l2arc_dev_mtx);
 
 	/* if there are no vdevs, there is nothing to do */
 	if (l2arc_ndev == 0)
-		return (NULL);
+		goto out;
 
 	first = NULL;
 	next = l2arc_dev_last;
 	do {
 		/* loop around the list looking for a non-faulted vdev */
@@ -3536,15 +3582,50 @@
 
 	} while (vdev_is_dead(next->l2ad_vdev));
 
 	/* if we were unable to find any usable vdevs, return NULL */
 	if (vdev_is_dead(next->l2ad_vdev))
-		return (NULL);
+		next = NULL;
 
 	l2arc_dev_last = next;
 
+out:
+	mutex_exit(&l2arc_dev_mtx);
+
+	/*
+	 * Grab the config lock to prevent the 'next' device from being
+	 * removed while we are writing to it.
+	 */
+	if (next != NULL)
+		spa_config_enter(next->l2ad_spa, RW_READER, next);
+	mutex_exit(&spa_namespace_lock);
+
 	return (next);
+}
+
+/*
+ * Free buffers that were tagged for destruction.
+ */
+static void
+l2arc_do_free_on_write()
+{
+	list_t *buflist;
+	l2arc_data_free_t *df, *df_prev;
+
+	mutex_enter(&l2arc_free_on_write_mtx);
+	buflist = l2arc_free_on_write;
+
+	for (df = list_tail(buflist); df; df = df_prev) {
+		df_prev = list_prev(buflist, df);
+		ASSERT(df->l2df_data != NULL);
+		ASSERT(df->l2df_func != NULL);
+		df->l2df_func(df->l2df_data, df->l2df_size);
+		list_remove(buflist, df);
+		kmem_free(df, sizeof (l2arc_data_free_t));
+	}
+
+	mutex_exit(&l2arc_free_on_write_mtx);
 }
 
 /*
  * A write to a cache device has completed.  Update all headers to allow
  * reads from these buffers to begin.
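
The rewritten l2arc_dev_get_next() above still implements a simple rotor: resume after the device used last time, wrap around the list, and skip faulted vdevs, returning NULL only when every device is dead. A compilable sketch of just the rotor (arrays stand in for the kernel list and vdev state; the two-lock handoff is omitted):

    #include <stdio.h>
    #include <stddef.h>

    typedef struct cachedev {
    	const char *name;
    	int dead;
    } cachedev_t;

    static cachedev_t devs[] = {
    	{ "cache0", 1 },	/* faulted */
    	{ "cache1", 0 },
    	{ "cache2", 0 },
    };
    static int ndev = 3;
    static int last = -1;	/* rotor position, like l2arc_dev_last */

    static cachedev_t *
    dev_get_next(void)
    {
    	int first = -1, i = last;

    	do {
    		i = (i + 1) % ndev;	/* advance, wrapping at the end */
    		if (first == -1)
    			first = i;
    		else if (i == first)
    			break;		/* full loop: all devices dead */
    	} while (devs[i].dead);

    	if (devs[i].dead)
    		return (NULL);
    	last = i;
    	return (&devs[i]);
    }

    int
    main(void)
    {
    	int n;

    	for (n = 0; n < 3; n++)
    		printf("next: %s\n", dev_get_next()->name);
    	/* cache1, cache2, cache1: round-robin over healthy devices */
    	return (0);
    }
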
@@ -3553,12 +3634,12 @@
 l2arc_write_done(zio_t *zio)
 {
 	l2arc_write_callback_t *cb;
 	l2arc_dev_t *dev;
 	list_t *buflist;
-	l2arc_data_free_t *df, *df_prev;
 	arc_buf_hdr_t *head, *ab, *ab_prev;
+	l2arc_buf_hdr_t *abl2;
 	kmutex_t *hash_lock;
 
 	cb = zio->io_private;
 	ASSERT(cb != NULL);
 	dev = cb->l2wcb_dev;
@@ -3592,13 +3673,17 @@
 			continue;
 		}
 
 		if (zio->io_error != 0) {
 			/*
-			 * Error - invalidate L2ARC entry.
+			 * Error - drop L2ARC entry.
 			 */
+			list_remove(buflist, ab);
+			abl2 = ab->b_l2hdr;
 			ab->b_l2hdr = NULL;
+			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
 		}
 
 		/*
 		 * Allow ARC to begin reads to this L2ARC entry.
 		 */
@@ -3610,24 +3695,11 @@
 	atomic_inc_64(&l2arc_writes_done);
 	list_remove(buflist, head);
 	kmem_cache_free(hdr_cache, head);
 	mutex_exit(&l2arc_buflist_mtx);
 
-	/*
-	 * Free buffers that were tagged for destruction.
-	 */
-	mutex_enter(&l2arc_free_on_write_mtx);
-	buflist = l2arc_free_on_write;
-	for (df = list_tail(buflist); df; df = df_prev) {
-		df_prev = list_prev(buflist, df);
-		ASSERT(df->l2df_data != NULL);
-		ASSERT(df->l2df_func != NULL);
-		df->l2df_func(df->l2df_data, df->l2df_size);
-		list_remove(buflist, df);
-		kmem_free(df, sizeof (l2arc_data_free_t));
-	}
-	mutex_exit(&l2arc_free_on_write_mtx);
+	l2arc_do_free_on_write();
 
 	kmem_free(cb, sizeof (l2arc_write_callback_t));
 }
 
 /*
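
l2arc_do_free_on_write() factors out the deferred-free drain so l2arc_fini() can reuse it below. The pattern: a buffer that could not be freed while a write had it pinned is queued with its free callback and size, and the queue is drained once it is safe. A user-space sketch with stand-in types (byteswap_free is a hypothetical callback, and the kernel protects the list with a mutex):

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct data_free {
    	void (*df_func)(void *, size_t);
    	void *df_data;
    	size_t df_size;
    	struct data_free *df_next;
    } data_free_t;

    static data_free_t *free_list;	/* mutex-guarded in the kernel */

    static void
    defer_free(void *data, size_t size, void (*func)(void *, size_t))
    {
    	data_free_t *df = malloc(sizeof (*df));

    	df->df_func = func;
    	df->df_data = data;
    	df->df_size = size;
    	df->df_next = free_list;
    	free_list = df;
    }

    static void
    byteswap_free(void *data, size_t size)	/* hypothetical callback */
    {
    	(void) size;
    	free(data);
    }

    static void
    do_free_on_write(void)
    {
    	data_free_t *df, *dn;

    	for (df = free_list; df != NULL; df = dn) {
    		dn = df->df_next;
    		df->df_func(df->df_data, df->df_size);	/* the real free */
    		free(df);
    	}
    	free_list = NULL;
    }

    int
    main(void)
    {
    	defer_free(malloc(64), 64, byteswap_free);
    	do_free_on_write();
    	printf("drained\n");
    	return (0);
    }
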
@@ -3640,11 +3712,11 @@
 	l2arc_read_callback_t *cb;
 	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf;
 	zio_t *rzio;
 	kmutex_t *hash_lock;
-	int equal, err = 0;
+	int equal;
 
 	cb = zio->io_private;
 	ASSERT(cb != NULL);
 	buf = cb->l2rcb_buf;
 	ASSERT(buf != NULL);
@@ -3666,33 +3738,31 @@
 		mutex_exit(hash_lock);
 		/*
 		 * Buffer didn't survive caching.  Increment stats and
 		 * reissue to the original storage device.
 		 */
-		if (zio->io_error != 0)
+		if (zio->io_error != 0) {
 			ARCSTAT_BUMP(arcstat_l2_io_error);
+		} else {
+			zio->io_error = EIO;
+		}
 		if (!equal)
 			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
 
-		zio->io_flags &= ~ZIO_FLAG_DONT_CACHE;
-		rzio = zio_read(NULL, cb->l2rcb_spa, &cb->l2rcb_bp,
-		    buf->b_data, zio->io_size, arc_read_done, buf,
-		    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb);
-
-		/*
-		 * Since this is a seperate thread, we can wait on this
-		 * I/O whether there is an io_waiter or not.
-		 */
-		err = zio_wait(rzio);
-
-		/*
-		 * Let the resent I/O call arc_read_done() instead.
-		 * io_error is set to the reissued I/O error status.
-		 */
-		zio->io_done = NULL;
-		zio->io_waiter = NULL;
-		zio->io_error = err;
+		if (zio->io_waiter == NULL) {
+			/*
+			 * Let the resent I/O call arc_read_done() instead.
+			 */
+			zio->io_done = NULL;
+			zio->io_flags &= ~ZIO_FLAG_DONT_CACHE;
+
+			rzio = zio_read(NULL, cb->l2rcb_spa, &cb->l2rcb_bp,
+			    buf->b_data, zio->io_size, arc_read_done, buf,
+			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb);
+
+			(void) zio_nowait(rzio);
+		}
 	}
 
 	kmem_free(cb, sizeof (l2arc_read_callback_t));
 }
 
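
The repair path in l2arc_read_done() now distinguishes waiters: if some thread is blocked in zio_wait(), it simply sees io_error (forced to EIO when the checksum, rather than the I/O itself, failed) and retries from arc_read(); only when nobody waits does this callback reissue the read itself and hand arc_read_done() to the new zio. A sketch of that decision (stub types, not ZFS internals):

    #include <stdio.h>
    #include <stddef.h>
    #include <errno.h>

    typedef struct model_zio {
    	int io_error;
    	void *io_waiter;	/* non-NULL while a thread sits in zio_wait() */
    } model_zio_t;

    static void
    reissue_primary_read(void)	/* stub for the zio_read() reissue */
    {
    	printf("reissued against the primary pool device\n");
    }

    static void
    l2_read_done(model_zio_t *zio, int checksum_ok)
    {
    	if (checksum_ok && zio->io_error == 0)
    		return;			/* the L2 copy was good */
    	if (zio->io_error == 0)
    		zio->io_error = EIO;	/* surface a bad checksum as EIO */
    	if (zio->io_waiter == NULL)
    		reissue_primary_read();	/* nobody waiting: repair here */
    	/* else: the waiter sees io_error and retries from arc_read() */
    }

    int
    main(void)
    {
    	model_zio_t z = { 0, NULL };

    	l2_read_done(&z, 0);
    	return (0);
    }
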
@@ -3750,12 +3820,10 @@
 	l2arc_buf_hdr_t *abl2;
 	arc_buf_hdr_t *ab, *ab_prev;
 	kmutex_t *hash_lock;
 	uint64_t taddr;
 
-	ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
-
 	buflist = dev->l2ad_buflist;
 
 	if (buflist == NULL)
 		return;
 
@@ -3765,11 +3833,11 @@
 		 * nothing to evict.
 		 */
 		return;
 	}
 
-	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * dev->l2ad_write))) {
+	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
 		/*
 		 * When nearing the end of the device, evict to the end
 		 * before the device write hand jumps to the start.
 		 */
 		taddr = dev->l2ad_end;
@@ -3834,10 +3902,20 @@
 			 */
 			arc_change_state(arc_anon, ab, hash_lock);
 			arc_hdr_destroy(ab);
 		} else {
 			/*
+			 * Invalidate issued or about to be issued
+			 * reads, since we may be about to write
+			 * over this location.
+			 */
+			if (HDR_L2_READING(ab)) {
+				ARCSTAT_BUMP(arcstat_l2_evict_reading);
+				ab->b_flags |= ARC_L2_EVICTED;
+			}
+
+			/*
 			 * Tell ARC this no longer exists in L2ARC.
 			 */
 			if (ab->b_l2hdr != NULL) {
 				abl2 = ab->b_l2hdr;
 				ab->b_l2hdr = NULL;
@@ -3849,20 +3927,10 @@
 			/*
 			 * This may have been leftover after a
 			 * failed write.
 			 */
 			ab->b_flags &= ~ARC_L2_WRITING;
-
-			/*
-			 * Invalidate issued or about to be issued
-			 * reads, since we may be about to write
-			 * over this location.
-			 */
-			if (HDR_L2_READING(ab)) {
-				ARCSTAT_BUMP(arcstat_l2_evict_reading);
-				ab->b_flags |= ARC_L2_EVICTED;
-			}
 		}
 		mutex_exit(hash_lock);
 	}
 	mutex_exit(&l2arc_buflist_mtx);
 
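
l2arc_evict() treats the cache device as a circular log: evict a write-target's worth of space ahead of the hand, and when the hand nears the end of the device, evict all the way to the end so the hand can wrap cleanly back to l2ad_start (the bump itself happens in l2arc_write_buffers()). The arithmetic in isolation, with toy numbers in place of byte offsets:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
    	uint64_t l2ad_start = 0, l2ad_end = 100, l2ad_hand = 95;
    	uint64_t distance = 8, taddr;

    	if (l2ad_hand >= l2ad_end - (2 * distance)) {
    		/* near the end: evict to the end, the hand will wrap */
    		taddr = l2ad_end;
    	} else {
    		taddr = l2ad_hand + distance;
    	}
    	printf("evict up to %llu\n", (unsigned long long)taddr);

    	/* after writing, bump the hand back to the start if it is close */
    	if (l2ad_hand >= l2ad_end - distance)
    		l2ad_hand = l2ad_start;
    	printf("hand now at %llu\n", (unsigned long long)l2ad_hand);
    	return (0);
    }
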
@@ -3875,25 +3943,22 @@
  *
  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
  * for reading until they have completed writing.
  */
 static void
-l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev)
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 {
 	arc_buf_hdr_t *ab, *ab_prev, *head;
 	l2arc_buf_hdr_t *hdrl2;
 	list_t *list;
-	uint64_t passed_sz, write_sz, buf_sz;
-	uint64_t target_sz = dev->l2ad_write;
-	uint64_t headroom = dev->l2ad_write * l2arc_headroom;
+	uint64_t passed_sz, write_sz, buf_sz, headroom;
 	void *buf_data;
 	kmutex_t *hash_lock, *list_lock;
 	boolean_t have_lock, full;
 	l2arc_write_callback_t *cb;
 	zio_t *pio, *wzio;
 
-	ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
 	ASSERT(dev->l2ad_vdev != NULL);
 
 	pio = NULL;
 	write_sz = 0;
 	full = B_FALSE;
@@ -3906,12 +3971,27 @@
 	mutex_enter(&l2arc_buflist_mtx);
 	for (int try = 0; try <= 3; try++) {
 		list = l2arc_list_locked(try, &list_lock);
 		passed_sz = 0;
 
-		for (ab = list_tail(list); ab; ab = ab_prev) {
-			ab_prev = list_prev(list, ab);
+		/*
+		 * L2ARC fast warmup.
+		 *
+		 * Until the ARC is warm and starts to evict, read from the
+		 * head of the ARC lists rather than the tail.
+		 */
+		headroom = target_sz * l2arc_headroom;
+		if (arc_warm == B_FALSE)
+			ab = list_head(list);
+		else
+			ab = list_tail(list);
+
+		for (; ab; ab = ab_prev) {
+			if (arc_warm == B_FALSE)
+				ab_prev = list_next(list, ab);
+			else
+				ab_prev = list_prev(list, ab);
 
 			hash_lock = HDR_LOCK(ab);
 			have_lock = MUTEX_HELD(hash_lock);
 			if (!have_lock && !mutex_tryenter(hash_lock)) {
 				/*
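
The headroom value computed above caps how far a single feed pass walks an ARC list: once roughly l2arc_headroom times the target write size has been passed over, the scan gives up rather than traverse the whole ARC. The cutoff in isolation (a stand-alone loop; the kernel loop also stops early once the write target itself is filled):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
    	uint64_t target_sz = 8, l2arc_headroom = 4;
    	uint64_t headroom = target_sz * l2arc_headroom;	/* 32 */
    	uint64_t passed_sz = 0, buf_sz = 5;
    	int scanned = 0;

    	while (passed_sz <= headroom) {	/* examine buffers until cutoff */
    		passed_sz += buf_sz;
    		scanned++;
    	}
    	printf("scanned %d buffers before hitting headroom\n", scanned);
    	return (0);
    }
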
@@ -4030,11 +4110,11 @@
 
 	/*
 	 * Bump device hand to the device start if it is approaching the end.
 	 * l2arc_evict() will already have evicted ahead for this case.
 	 */
-	if (dev->l2ad_hand >= (dev->l2ad_end - dev->l2ad_write)) {
+	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
 		spa_l2cache_space_update(dev->l2ad_vdev, 0,
 		    dev->l2ad_end - dev->l2ad_hand);
 		dev->l2ad_hand = dev->l2ad_start;
 		dev->l2ad_evict = dev->l2ad_start;
 		dev->l2ad_first = B_FALSE;
@@ -4051,70 +4131,76 @@
 l2arc_feed_thread(void)
 {
 	callb_cpr_t cpr;
 	l2arc_dev_t *dev;
 	spa_t *spa;
-	int interval;
-	boolean_t startup = B_TRUE;
+	uint64_t size;
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&l2arc_feed_thr_lock);
 
 	while (l2arc_thread_exit == 0) {
 		/*
-		 * Initially pause for L2ARC_FEED_DELAY seconds as a grace
-		 * interval during boot, followed by l2arc_feed_secs seconds
-		 * thereafter.
+		 * Pause for l2arc_feed_secs seconds between writes.
 		 */
 		CALLB_CPR_SAFE_BEGIN(&cpr);
-		if (startup) {
-			interval = L2ARC_FEED_DELAY;
-			startup = B_FALSE;
-		} else {
-			interval = l2arc_feed_secs;
-		}
 		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
-		    lbolt + (hz * interval));
+		    lbolt + (hz * l2arc_feed_secs));
 		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
 
+		/*
+		 * Quick check for L2ARC devices.
+		 */
 		mutex_enter(&l2arc_dev_mtx);
+		if (l2arc_ndev == 0) {
+			mutex_exit(&l2arc_dev_mtx);
+			continue;
+		}
+		mutex_exit(&l2arc_dev_mtx);
 
 		/*
 		 * This selects the next l2arc device to write to, and in
 		 * doing so the next spa to feed from: dev->l2ad_spa.   This
-		 * will return NULL if there are no l2arc devices or if they
-		 * are all faulted.
+		 * will return NULL if there are now no l2arc devices or if
+		 * they are all faulted.
+		 *
+		 * If a device is returned, its spa's config lock is also
+		 * held to prevent device removal.  l2arc_dev_get_next()
+		 * will grab and release l2arc_dev_mtx.
 		 */
-		if ((dev = l2arc_dev_get_next()) == NULL) {
-			mutex_exit(&l2arc_dev_mtx);
+		if ((dev = l2arc_dev_get_next()) == NULL)
 			continue;
-		}
+
+		spa = dev->l2ad_spa;
+		ASSERT(spa != NULL);
 
 		/*
 		 * Avoid contributing to memory pressure.
 		 */
 		if (arc_reclaim_needed()) {
 			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
-			mutex_exit(&l2arc_dev_mtx);
+			spa_config_exit(spa, dev);
			continue;
 		}
 
-		spa = dev->l2ad_spa;
-		ASSERT(spa != NULL);
 		ARCSTAT_BUMP(arcstat_l2_feeds);
+
+		size = dev->l2ad_write;
+		if (arc_warm == B_FALSE)
+			size += dev->l2ad_boost;
 
 		/*
 		 * Evict L2ARC buffers that will be overwritten.
 		 */
-		l2arc_evict(dev, dev->l2ad_write, B_FALSE);
+		l2arc_evict(dev, size, B_FALSE);
 
 		/*
 		 * Write ARC buffers.
 		 */
-		l2arc_write_buffers(spa, dev);
-		mutex_exit(&l2arc_dev_mtx);
+		l2arc_write_buffers(spa, dev, size);
+		spa_config_exit(spa, dev);
 	}
 
 	l2arc_thread_exit = 0;
 	cv_broadcast(&l2arc_feed_thr_cv);
 	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
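
The feed thread now sleeps a flat l2arc_feed_secs between passes; the boot-time L2ARC_FEED_DELAY grace interval is gone, replaced by the warmup boost. A POSIX-threads sketch of the loop's cadence and shutdown handshake, standing in for the kernel's CPR-safe cv_timedwait():

    #include <stdio.h>
    #include <pthread.h>
    #include <time.h>

    static pthread_mutex_t feed_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t feed_cv = PTHREAD_COND_INITIALIZER;
    static int thread_exit;

    static void *
    feed_thread(void *arg)
    {
    	(void) arg;
    	pthread_mutex_lock(&feed_lock);
    	while (!thread_exit) {
    		struct timespec ts;

    		clock_gettime(CLOCK_REALTIME, &ts);
    		ts.tv_sec += 1;		/* l2arc_feed_secs */
    		pthread_cond_timedwait(&feed_cv, &feed_lock, &ts);
    		if (thread_exit)
    			break;
    		/* evict ahead of the hand, then write one pass's worth */
    		printf("feed pass\n");
    	}
    	pthread_mutex_unlock(&feed_lock);
    	return (NULL);
    }

    int
    main(void)
    {
    	pthread_t tid;
    	struct timespec pause = { 2, 500000000 };	/* ~2 passes */

    	pthread_create(&tid, NULL, feed_thread, NULL);
    	nanosleep(&pause, NULL);

    	pthread_mutex_lock(&feed_lock);
    	thread_exit = 1;			/* wake and stop the loop */
    	pthread_cond_signal(&feed_cv);
    	pthread_mutex_unlock(&feed_lock);
    	pthread_join(tid, NULL);
    	return (0);
    }
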
@@ -4153,10 +4239,11 @@
 	 */
 	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
 	adddev->l2ad_spa = spa;
 	adddev->l2ad_vdev = vd;
 	adddev->l2ad_write = l2arc_write_max;
+	adddev->l2ad_boost = l2arc_write_boost;
 	adddev->l2ad_start = start;
 	adddev->l2ad_end = end;
 	adddev->l2ad_hand = adddev->l2ad_start;
 	adddev->l2ad_evict = adddev->l2ad_start;
 	adddev->l2ad_first = B_TRUE;
@@ -4186,16 +4273,10 @@
  */
 void
 l2arc_remove_vdev(vdev_t *vd)
 {
 	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
-
-	/*
-	 * We can only grab the spa config lock when cache device writes
-	 * complete.
-	 */
-	ASSERT3U(l2arc_writes_sent, ==, l2arc_writes_done);
 
 	/*
 	 * Find the device by vdev
 	 */
 	mutex_enter(&l2arc_dev_mtx);
@@ -4211,21 +4292,20 @@
 	/*
 	 * Remove device from global list
 	 */
 	list_remove(l2arc_dev_list, remdev);
 	l2arc_dev_last = NULL;		/* may have been invalidated */
+	atomic_dec_64(&l2arc_ndev);
+	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * Clear all buflists and ARC references.  L2ARC device flush.
 	 */
 	l2arc_evict(remdev, 0, B_TRUE);
 	list_destroy(remdev->l2ad_buflist);
 	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
 	kmem_free(remdev, sizeof (l2arc_dev_t));
-
-	atomic_dec_64(&l2arc_ndev);
-	mutex_exit(&l2arc_dev_mtx);
 }
 
 void
 l2arc_init()
 {
@@ -4252,17 +4332,25 @@
 }
 
 void
 l2arc_fini()
 {
+	/*
+	 * This is called from dmu_fini(), which is called from spa_fini();
+	 * Because of this, we can assume that all l2arc devices have
+	 * already been removed when the pools themselves were removed.
+	 */
+
 	mutex_enter(&l2arc_feed_thr_lock);
 	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
 	l2arc_thread_exit = 1;
 	while (l2arc_thread_exit != 0)
 		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
 	mutex_exit(&l2arc_feed_thr_lock);
 
+	l2arc_do_free_on_write();
+
 	mutex_destroy(&l2arc_feed_thr_lock);
 	cv_destroy(&l2arc_feed_thr_cv);
 	mutex_destroy(&l2arc_dev_mtx);
 	mutex_destroy(&l2arc_buflist_mtx);
 	mutex_destroy(&l2arc_free_on_write_mtx);