6494473 ZFS needs a way to slow down resilvering
author: George Wilson <George.Wilson@Sun.COM>
Tue, 08 Jun 2010 12:32:02 -0700
changeset 12586 b118bbd65be9
parent 12585 f47e37e9e164
child 12587 16aef3d16470
6494473 ZFS needs a way to slow down resilvering 6743992 scrub/resilver causes systemic slowdown 6936821 scrub/resilver io should not be suspended 6956464 otoro: head panic in zfs:dnode_hold_impl; during system disk zinject testing
usr/src/uts/common/fs/zfs/dnode.c
usr/src/uts/common/fs/zfs/dsl_pool.c
usr/src/uts/common/fs/zfs/dsl_scan.c
usr/src/uts/common/fs/zfs/sys/spa_impl.h
usr/src/uts/common/fs/zfs/sys/zio.h
usr/src/uts/common/fs/zfs/txg.c
usr/src/uts/common/fs/zfs/vdev.c
usr/src/uts/common/fs/zfs/zio.c
--- a/usr/src/uts/common/fs/zfs/dnode.c	Tue Jun 08 06:45:59 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode.c	Tue Jun 08 12:32:02 2010 -0700
@@ -603,8 +603,7 @@
 	 */
 	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
 	    (spa_is_root(os->os_spa) &&
-	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER) &&
-	    !spa_config_held(os->os_spa, SCL_ZIO, RW_WRITER)));
+	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
 
 	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
 		dn = (object == DMU_USERUSED_OBJECT) ?
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c	Tue Jun 08 06:45:59 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c	Tue Jun 08 12:32:02 2010 -0700
@@ -42,7 +42,7 @@
 
 int zfs_no_write_throttle = 0;
 int zfs_write_limit_shift = 3;			/* 1/8th of physical memory */
-int zfs_txg_synctime_ms = 5000;		/* target millisecs to sync a txg */
+int zfs_txg_synctime_ms = 1000;		/* target millisecs to sync a txg */
 
 uint64_t zfs_write_limit_min = 32 << 20;	/* min write limit is 32MB */
 uint64_t zfs_write_limit_max = 0;		/* max data payload per txg */
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c	Tue Jun 08 06:45:59 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c	Tue Jun 08 12:32:02 2010 -0700
@@ -56,6 +56,11 @@
 static dsl_syncfunc_t dsl_scan_cancel_sync;
 static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
 
+int zfs_top_maxinflight = 32;		/* maximum I/Os per top-level */
+int zfs_resilver_delay = 2;		/* number of ticks to delay resilver */
+int zfs_scrub_delay = 4;		/* number of ticks to delay scrub */
+int zfs_scan_idle = 50;			/* idle window in clock ticks */
+
 int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
 int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
 int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
@@ -601,8 +606,8 @@
 	 * done before setting xlateall (similar to dsl_read())
 	 */
 	(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
-	    buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
-	    &flags, &czb);
+	    buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
 }
 
 static boolean_t
@@ -650,6 +655,7 @@
     const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
 {
 	dsl_pool_t *dp = scn->scn_dp;
+	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
 	int err;
 
 	if (BP_GET_LEVEL(bp) > 0) {
@@ -660,7 +666,7 @@
 
 		err = arc_read_nolock(NULL, dp->dp_spa, bp,
 		    arc_getbuf_func, bufp,
-		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
@@ -683,7 +689,7 @@
 
 		err = arc_read_nolock(NULL, dp->dp_spa, bp,
 		    arc_getbuf_func, bufp,
-		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
@@ -696,7 +702,7 @@
 
 		err = arc_read_nolock(NULL, dp->dp_spa, bp,
 		    arc_getbuf_func, bufp,
-		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
@@ -719,7 +725,7 @@
 
 		err = arc_read_nolock(NULL, dp->dp_spa, bp,
 		    arc_getbuf_func, bufp,
-		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
@@ -1446,7 +1452,6 @@
 		dsl_scan_setup_sync(scn, &func, tx);
 	}
 
-
 	if (!dsl_scan_active(scn) ||
 	    spa_sync_pass(dp->dp_spa) > 1)
 		return;
@@ -1489,7 +1494,6 @@
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
-
 	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
 	    scn->scn_phys.scn_ddt_class_max) {
 		zfs_dbgmsg("doing scan sync txg %llu; "
@@ -1644,8 +1648,9 @@
 	spa_t *spa = dp->dp_spa;
 	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
 	boolean_t needs_io;
-	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
 	int zio_priority;
+	int scan_delay = 0;
 
 	if (phys_birth <= scn->scn_phys.scn_min_txg ||
 	    phys_birth >= scn->scn_phys.scn_max_txg)
@@ -1658,10 +1663,12 @@
 		zio_flags |= ZIO_FLAG_SCRUB;
 		zio_priority = ZIO_PRIORITY_SCRUB;
 		needs_io = B_TRUE;
+		scan_delay = zfs_scrub_delay;
 	} else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
 		zio_flags |= ZIO_FLAG_RESILVER;
 		zio_priority = ZIO_PRIORITY_RESILVER;
 		needs_io = B_FALSE;
+		scan_delay = zfs_resilver_delay;
 	}
 
 	/* If it's an intent log block, failure is expected. */
@@ -1699,14 +1706,23 @@
 	}
 
 	if (needs_io && !zfs_no_scrub_io) {
+		vdev_t *rvd = spa->spa_root_vdev;
+		uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
 		void *data = zio_data_buf_alloc(size);
 
 		mutex_enter(&spa->spa_scrub_lock);
-		while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
+		while (spa->spa_scrub_inflight >= maxinflight)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		spa->spa_scrub_inflight++;
 		mutex_exit(&spa->spa_scrub_lock);
 
+		/*
+		 * If we're seeing recent (zfs_scan_idle) "important" I/Os
+		 * then throttle our workload to limit the impact of a scan.
+		 */
+		if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
+			delay(scan_delay);
+
 		zio_nowait(zio_read(NULL, spa, bp, data, size,
 		    dsl_scan_scrub_done, NULL, zio_priority,
 		    zio_flags, zb));
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Tue Jun 08 06:45:59 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Tue Jun 08 12:32:02 2010 -0700
@@ -146,9 +146,9 @@
 	uberblock_t	spa_ubsync;		/* last synced uberblock */
 	uberblock_t	spa_uberblock;		/* current uberblock */
 	boolean_t	spa_extreme_rewind;	/* rewind past deferred frees */
+	uint64_t	spa_last_io;		/* lbolt of last non-scan I/O */
 	kmutex_t	spa_scrub_lock;		/* resilver/scrub lock */
 	uint64_t	spa_scrub_inflight;	/* in-flight scrub I/Os */
-	uint64_t	spa_scrub_maxinflight;	/* max in-flight scrub I/Os */
 	kcondvar_t	spa_scrub_io_cv;	/* scrub I/O completion */
 	uint8_t		spa_scrub_active;	/* active or suspended? */
 	uint8_t		spa_scrub_type;		/* type of scrub we're doing */
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Tue Jun 08 06:45:59 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Tue Jun 08 12:32:02 2010 -0700
@@ -147,7 +147,7 @@
 	ZIO_FLAG_SELF_HEAL	= 1 << 2,
 	ZIO_FLAG_RESILVER	= 1 << 3,
 	ZIO_FLAG_SCRUB		= 1 << 4,
-	ZIO_FLAG_SCRUB_THREAD	= 1 << 5,
+	ZIO_FLAG_SCAN_THREAD	= 1 << 5,
 
 #define	ZIO_FLAG_AGG_INHERIT	(ZIO_FLAG_CANFAIL - 1)
 
--- a/usr/src/uts/common/fs/zfs/txg.c	Tue Jun 08 06:45:59 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/txg.c	Tue Jun 08 12:32:02 2010 -0700
@@ -37,7 +37,7 @@
 static void txg_sync_thread(dsl_pool_t *dp);
 static void txg_quiesce_thread(dsl_pool_t *dp);
 
-int zfs_txg_timeout = 30;	/* max seconds worth of delta per txg */
+int zfs_txg_timeout = 5;	/* max seconds worth of delta per txg */
 
 /*
  * Prepare the txg subsystem.
--- a/usr/src/uts/common/fs/zfs/vdev.c	Tue Jun 08 06:45:59 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Tue Jun 08 12:32:02 2010 -0700
@@ -207,9 +207,6 @@
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
-
-	if (cvd->vdev_ops->vdev_op_leaf)
-		cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
 }
 
 void
@@ -244,9 +241,6 @@
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
-
-	if (cvd->vdev_ops->vdev_op_leaf)
-		cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
 }
 
 /*
@@ -2541,7 +2535,7 @@
 		mutex_enter(&vd->vdev_stat_lock);
 
 		if (flags & ZIO_FLAG_IO_REPAIR) {
-			if (flags & ZIO_FLAG_SCRUB_THREAD) {
+			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				dsl_scan_phys_t *scn_phys =
 				    &spa->spa_dsl_pool->dp_scan->scn_phys;
 				uint64_t *processed = &scn_phys->scn_processed;
@@ -2597,7 +2591,7 @@
 
 	if (type == ZIO_TYPE_WRITE && txg != 0 &&
 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
-	    (flags & ZIO_FLAG_SCRUB_THREAD) ||
+	    (flags & ZIO_FLAG_SCAN_THREAD) ||
 	    spa->spa_claiming)) {
 		/*
 		 * This is either a normal write (not a repair), or it's
@@ -2616,7 +2610,7 @@
 		 */
 		if (vd->vdev_ops->vdev_op_leaf) {
 			uint64_t commit_txg = txg;
-			if (flags & ZIO_FLAG_SCRUB_THREAD) {
+			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				ASSERT(spa_sync_pass(spa) == 1);
 				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
--- a/usr/src/uts/common/fs/zfs/zio.c	Tue Jun 08 06:45:59 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zio.c	Tue Jun 08 12:32:02 2010 -0700
@@ -2247,6 +2247,26 @@
 		return (vdev_mirror_ops.vdev_op_io_start(zio));
 	}
 
+	/*
+	 * We keep track of time-sensitive I/Os so that the scan thread
+	 * can quickly react to certain workloads.  In particular, we care
+	 * about non-scrubbing, top-level reads and writes with the following
+	 * characteristics:
+	 * 	- synchronous writes of user data to non-slog devices
+	 *	- any reads of user data
+	 * When these conditions are met, adjust the timestamp of spa_last_io
+	 * which allows the scan thread to adjust its workload accordingly.
+	 */
+	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
+	    vd == vd->vdev_top && !vd->vdev_islog &&
+	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
+	    zio->io_txg != spa_syncing_txg(spa)) {
+		uint64_t old = spa->spa_last_io;
+		uint64_t new = ddi_get_lbolt64();
+		if (old != new)
+			(void) atomic_cas_64(&spa->spa_last_io, old, new);
+	}
+
 	align = 1ULL << vd->vdev_top->vdev_ashift;
 
 	if (P2PHASE(zio->io_size, align) != 0) {
@@ -2744,6 +2764,7 @@
 
 		if ((zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_FREE) &&
+		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
 		    zio->io_error == ENXIO &&
 		    spa_load_state(spa) == SPA_LOAD_NONE &&
 		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)