6678033 resilver code should prefetch
authorGeorge Wilson <George.Wilson@Sun.COM>
Sat, 21 Nov 2009 23:44:56 -0800
changeset 11147 74e8c05021f1
parent 11146 7e58f40bcb1c
child 11148 68adfb531269
6678033 resilver code should prefetch 6841580 zfs stack overflow when upgrading to userspace accounting 6859446 scrub doesn't pause correctly
usr/src/cmd/mdb/common/modules/zfs/zfs.c
usr/src/uts/common/fs/zfs/dsl_pool.c
usr/src/uts/common/fs/zfs/dsl_scrub.c
usr/src/uts/common/fs/zfs/spa_errlog.c
usr/src/uts/common/fs/zfs/spa_misc.c
usr/src/uts/common/fs/zfs/sys/dsl_pool.h
usr/src/uts/common/fs/zfs/sys/spa.h
usr/src/uts/common/fs/zfs/txg.c
usr/src/uts/common/fs/zfs/vdev.c
usr/src/uts/common/fs/zfs/zfs_fm.c
usr/src/uts/common/fs/zfs/zfs_ioctl.c
usr/src/uts/common/fs/zfs/zio.c
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Sat Nov 21 22:51:29 2009 -0800
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Sat Nov 21 23:44:56 2009 -0800
@@ -313,6 +313,8 @@
 		"zfs_vdev_max_pending",
 		"zfs_vdev_min_pending",
 		"zfs_scrub_limit",
+		"zfs_no_scrub_io",
+		"zfs_no_scrub_prefetch",
 		"zfs_vdev_time_shift",
 		"zfs_vdev_ramp_rate",
 		"zfs_vdev_aggregation_limit",
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c	Sat Nov 21 22:51:29 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c	Sat Nov 21 23:44:56 2009 -0800
@@ -382,8 +382,12 @@
 		dsl_dir_sync(dd, tx);
 	write_time += gethrtime() - start;
 
-	if (spa_sync_pass(dp->dp_spa) == 1)
+	if (spa_sync_pass(dp->dp_spa) == 1) {
+		dp->dp_scrub_prefetch_zio_root = zio_root(dp->dp_spa, NULL,
+		    NULL, ZIO_FLAG_CANFAIL);
 		dsl_pool_scrub_sync(dp, tx);
+		(void) zio_wait(dp->dp_scrub_prefetch_zio_root);
+	}
 
 	start = gethrtime();
 	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c	Sat Nov 21 22:51:29 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c	Sat Nov 21 23:44:56 2009 -0800
@@ -53,6 +53,7 @@
 int zfs_scrub_min_time = 1000; /* (millisec) min time to scrub per txg */
 int zfs_resilver_min_time = 3000; /* (millisec) min time to resilver per txg */
 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
 
 extern int zfs_txg_timeout;
@@ -450,6 +451,27 @@
 }
 
 static void
+scrub_prefetch(dsl_pool_t *dp, arc_buf_t *buf, blkptr_t *bp, uint64_t objset,
+    uint64_t object, uint64_t blkid)
+{
+	zbookmark_t czb;
+	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+
+	if (zfs_no_scrub_prefetch)
+		return;
+
+	if (BP_IS_HOLE(bp) || bp->blk_birth <= dp->dp_scrub_min_txg ||
+	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
+		return;
+
+	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+
+	(void) arc_read(dp->dp_scrub_prefetch_zio_root, dp->dp_spa, bp,
+	    buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+	    &flags, &czb);
+}
+
+static void
 scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
 {
@@ -487,6 +509,13 @@
 		}
 	}
 
+	/*
 	 * If dsl_pool_scrub_ddt() has already scrubbed this block,
+	 * don't scrub it again.
+	 */
+	if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp))
+		(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
+
 	if (BP_GET_LEVEL(bp) > 0) {
 		uint32_t flags = ARC_WAIT;
 		int i;
@@ -502,9 +531,11 @@
 			mutex_exit(&dp->dp_spa->spa_scrub_lock);
 			return;
 		}
-		cbp = buf->b_data;
-
-		for (i = 0; i < epb; i++, cbp++) {
+		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+			scrub_prefetch(dp, buf, cbp, zb->zb_objset,
+			    zb->zb_object, zb->zb_blkid * epb + i);
+		}
+		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
 			zbookmark_t czb;
 
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
@@ -514,8 +545,8 @@
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		uint32_t flags = ARC_WAIT;
-		dnode_phys_t *child_dnp;
-		int i;
+		dnode_phys_t *cdnp;
+		int i, j;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 
 		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
@@ -527,10 +558,15 @@
 			mutex_exit(&dp->dp_spa->spa_scrub_lock);
 			return;
 		}
-		child_dnp = buf->b_data;
-
-		for (i = 0; i < epb; i++, child_dnp++) {
-			scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset,
+		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
+			for (j = 0; j < cdnp->dn_nblkptr; j++) {
+				blkptr_t *cbp = &cdnp->dn_blkptr[j];
+				scrub_prefetch(dp, buf, cbp, zb->zb_objset,
+				    zb->zb_blkid * epb + i, j);
+			}
+		}
+		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
+			scrub_visitdnode(dp, cdnp, buf, zb->zb_objset,
 			    zb->zb_blkid * epb + i);
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
@@ -561,13 +597,6 @@
 		}
 	}
 
-	/*
-	 * If dsl_pool_scrub_ddt() has aready scrubbed this block,
-	 * don't scrub it again.
-	 */
-	if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp))
-		(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
-
 	if (buf)
 		(void) arc_buf_remove_ref(buf, &buf);
 }
@@ -887,7 +916,7 @@
 	/*
 	 * If the pool is not loaded, or is trying to unload, leave it alone.
 	 */
-	if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
+	if (spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa))
 		return;
 
 	if (dp->dp_scrub_restart) {
--- a/usr/src/uts/common/fs/zfs/spa_errlog.c	Sat Nov 21 22:51:29 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/spa_errlog.c	Sat Nov 21 23:44:56 2009 -0800
@@ -132,7 +132,7 @@
 	 * If we are trying to import a pool, ignore any errors, as we won't be
 	 * writing to the pool any time soon.
 	 */
-	if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
 		return;
 
 	mutex_enter(&spa->spa_errlist_lock);
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Sat Nov 21 22:51:29 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Sat Nov 21 23:44:56 2009 -0800
@@ -1236,6 +1236,12 @@
 	return (spa->spa_state);
 }
 
+spa_load_state_t
+spa_load_state(spa_t *spa)
+{
+	return (spa->spa_load_state);
+}
+
 uint64_t
 spa_freeze_txg(spa_t *spa)
 {
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Sat Nov 21 22:51:29 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Sat Nov 21 23:44:56 2009 -0800
@@ -105,6 +105,7 @@
 	boolean_t dp_scrub_isresilver;
 	boolean_t dp_scrub_restart;
 	kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */
+	zio_t *dp_scrub_prefetch_zio_root;
 
 	/* Has its own locking */
 	tx_state_t dp_tx;
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Sat Nov 21 22:51:29 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Sat Nov 21 23:44:56 2009 -0800
@@ -555,6 +555,7 @@
 extern uint64_t spa_syncing_txg(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern pool_state_t spa_state(spa_t *spa);
+extern spa_load_state_t spa_load_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
 extern uint64_t spa_get_dspace(spa_t *spa);
--- a/usr/src/uts/common/fs/zfs/txg.c	Sat Nov 21 22:51:29 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/txg.c	Sat Nov 21 23:44:56 2009 -0800
@@ -353,6 +353,7 @@
 static void
 txg_sync_thread(dsl_pool_t *dp)
 {
+	spa_t *spa = dp->dp_spa;
 	tx_state_t *tx = &dp->dp_tx;
 	callb_cpr_t cpr;
 	uint64_t start, delta;
@@ -371,7 +372,8 @@
 		 */
 		timer = (delta >= timeout ? 0 : timeout - delta);
 		while ((dp->dp_scrub_func == SCRUB_FUNC_NONE ||
-		    spa_shutting_down(dp->dp_spa)) &&
+		    spa_load_state(spa) != SPA_LOAD_NONE ||
+		    spa_shutting_down(spa)) &&
 		    !tx->tx_exiting && timer > 0 &&
 		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
 		    tx->tx_quiesced_txg == 0) {
@@ -411,7 +413,7 @@
 		mutex_exit(&tx->tx_sync_lock);
 
 		start = ddi_get_lbolt();
-		spa_sync(dp->dp_spa, txg);
+		spa_sync(spa, txg);
 		delta = ddi_get_lbolt() - start;
 
 		mutex_enter(&tx->tx_sync_lock);
--- a/usr/src/uts/common/fs/zfs/vdev.c	Sat Nov 21 22:51:29 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Sat Nov 21 23:44:56 2009 -0800
@@ -529,7 +529,7 @@
 		 * valid in the current context.  Local vdevs will
 		 * remain in the faulted state.
 		 */
-		if (spa->spa_load_state == SPA_LOAD_OPEN) {
+		if (spa_load_state(spa) == SPA_LOAD_OPEN) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
 			    &vd->vdev_faulted);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
@@ -1345,7 +1345,7 @@
 		 * state of the pool.
 		 */
 		if (!spa->spa_load_verbatim &&
-		    spa->spa_load_state == SPA_LOAD_OPEN &&
+		    spa_load_state(spa) == SPA_LOAD_OPEN &&
 		    state != POOL_STATE_ACTIVE)
 			return (EBADF);
 
@@ -2900,7 +2900,7 @@
 		 * begin with.  Failure to open such a device is not considered
 		 * an error.
 		 */
-		if (spa->spa_load_state == SPA_LOAD_IMPORT &&
+		if (spa_load_state(spa) == SPA_LOAD_IMPORT &&
 		    vd->vdev_ops->vdev_op_leaf)
 			vd->vdev_not_present = 1;
 
--- a/usr/src/uts/common/fs/zfs/zfs_fm.c	Sat Nov 21 22:51:29 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_fm.c	Sat Nov 21 23:44:56 2009 -0800
@@ -112,8 +112,8 @@
 	 * If we are doing a spa_tryimport() or in recovery mode,
 	 * ignore errors.
 	 */
-	if (spa->spa_load_state == SPA_LOAD_TRYIMPORT ||
-	    spa->spa_load_state == SPA_LOAD_RECOVER)
+	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
+	    spa_load_state(spa) == SPA_LOAD_RECOVER)
 		return;
 
 	/*
@@ -121,7 +121,7 @@
 	 * failed, don't bother logging any new ereports - we're just going to
 	 * get the same diagnosis anyway.
 	 */
-	if (spa->spa_load_state != SPA_LOAD_NONE &&
+	if (spa_load_state(spa) != SPA_LOAD_NONE &&
 	    spa->spa_last_open_failed)
 		return;
 
@@ -202,7 +202,7 @@
 	 * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
 	 * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
 	 */
-	if (spa->spa_load_state != SPA_LOAD_NONE) {
+	if (spa_load_state(spa) != SPA_LOAD_NONE) {
 		if (spa->spa_ena == 0)
 			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
 		ena = spa->spa_ena;
@@ -238,7 +238,7 @@
 	    DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
 	    DATA_TYPE_UINT64, spa_guid(spa),
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
-	    spa->spa_load_state, NULL);
+	    spa_load_state(spa), NULL);
 
 	if (spa != NULL) {
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
@@ -805,7 +805,7 @@
 	nvlist_t *resource;
 	char class[64];
 
-	if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
 		return;
 
 	if ((resource = fm_nvlist_create(NULL)) == NULL)
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Sat Nov 21 22:51:29 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Sat Nov 21 23:44:56 2009 -0800
@@ -1890,9 +1890,12 @@
 		zfsvfs_rele(zfsvfs, FTAG);
 
 		if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
-			zfs_cmd_t zc = { 0 };
-			(void) strcpy(zc.zc_name, dsname);
-			(void) zfs_ioc_userspace_upgrade(&zc);
+			zfs_cmd_t *zc;
+
+			zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+			(void) strcpy(zc->zc_name, dsname);
+			(void) zfs_ioc_userspace_upgrade(zc);
+			kmem_free(zc, sizeof (zfs_cmd_t));
 		}
 		break;
 	}
--- a/usr/src/uts/common/fs/zfs/zio.c	Sat Nov 21 22:51:29 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c	Sat Nov 21 23:44:56 2009 -0800
@@ -1785,7 +1785,7 @@
 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
 		ddt_entry_t *dde = zio->io_vsd;
 		if (ddt == NULL) {
-			ASSERT(zio->io_spa->spa_load_state != SPA_LOAD_NONE);
+			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
 			return (ZIO_PIPELINE_CONTINUE);
 		}
 		if (dde == NULL) {
@@ -2711,7 +2711,7 @@
 		if ((zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_FREE) &&
 		    zio->io_error == ENXIO &&
-		    spa->spa_load_state == SPA_LOAD_NONE &&
+		    spa_load_state(spa) == SPA_LOAD_NONE &&
 		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;