6646775 Speed up the dumpifying process for zvols
author Tim Haley <Tim.Haley@Sun.COM>
Fri, 17 Oct 2008 16:50:52 -0600
changeset 7872 40a9434212f6
parent 7871 7095c02af645
child 7873 f69a0edc8643
6646775 Speed up the dumpifying process for zvols
usr/src/uts/common/fs/zfs/arc.c
usr/src/uts/common/fs/zfs/dbuf.c
usr/src/uts/common/fs/zfs/dmu.c
usr/src/uts/common/fs/zfs/dmu_tx.c
usr/src/uts/common/fs/zfs/sys/arc.h
usr/src/uts/common/fs/zfs/sys/dbuf.h
usr/src/uts/common/fs/zfs/sys/dmu.h
usr/src/uts/common/fs/zfs/sys/zio.h
usr/src/uts/common/fs/zfs/zio.c
usr/src/uts/common/fs/zfs/zvol.c
--- a/usr/src/uts/common/fs/zfs/arc.c	Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/arc.c	Fri Oct 17 16:50:52 2008 -0600
@@ -3077,7 +3077,7 @@
 	kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
-static void
+void
 write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
 {
 	boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Fri Oct 17 16:50:52 2008 -0600
@@ -40,6 +40,8 @@
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 static arc_done_func_t dbuf_write_ready;
 static arc_done_func_t dbuf_write_done;
+static zio_done_func_t dbuf_skip_write_ready;
+static zio_done_func_t dbuf_skip_write_done;
 
 /*
  * Global data structures and functions for the dbuf cache.
@@ -396,7 +398,8 @@
 	} else {
 		dbuf_evict_user(db);
 		db->db.db_data = NULL;
-		db->db_state = DB_UNCACHED;
+		if (db->db_state != DB_NOFILL)
+			db->db_state = DB_UNCACHED;
 	}
 }
 
@@ -537,6 +540,9 @@
 	 */
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
+	if (db->db_state == DB_NOFILL)
+		return (EIO);
+
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
 
@@ -612,6 +618,8 @@
 		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
 		    db->db.db_size, db, type));
 		db->db_state = DB_FILL;
+	} else if (db->db_state == DB_NOFILL) {
+		dbuf_set_data(db, NULL);
 	} else {
 		ASSERT3U(db->db_state, ==, DB_CACHED);
 	}
@@ -755,6 +763,7 @@
 
 		mutex_enter(&db->db_mtx);
 		if (db->db_state == DB_UNCACHED ||
+		    db->db_state == DB_NOFILL ||
 		    db->db_state == DB_EVICTING) {
 			ASSERT(db->db.db_data == NULL);
 			mutex_exit(&db->db_mtx);
@@ -924,7 +933,8 @@
 	 * syncing context don't bother holding ahead.
 	 */
 	ASSERT(db->db_level != 0 ||
-	    db->db_state == DB_CACHED || db->db_state == DB_FILL);
+	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
+	    db->db_state == DB_NOFILL);
 
 	mutex_enter(&dn->dn_mtx);
 	/*
@@ -1011,22 +1021,26 @@
 	if (db->db_level == 0) {
 		void *data_old = db->db_buf;
 
-		if (db->db_blkid == DB_BONUS_BLKID) {
-			dbuf_fix_old_data(db, tx->tx_txg);
-			data_old = db->db.db_data;
-		} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
-			/*
-			 * Release the data buffer from the cache so that we
-			 * can modify it without impacting possible other users
-			 * of this cached data block.  Note that indirect
-			 * blocks and private objects are not released until the
-			 * syncing state (since they are only modified then).
-			 */
-			arc_release(db->db_buf, db);
-			dbuf_fix_old_data(db, tx->tx_txg);
-			data_old = db->db_buf;
+		if (db->db_state != DB_NOFILL) {
+			if (db->db_blkid == DB_BONUS_BLKID) {
+				dbuf_fix_old_data(db, tx->tx_txg);
+				data_old = db->db.db_data;
+			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+				/*
+				 * Release the data buffer from the cache so
+				 * that we can modify it without impacting
+				 * possible other users of this cached data
+				 * block.  Note that indirect blocks and
+				 * private objects are not released until the
+				 * syncing state (since they are only modified
+				 * then).
+				 */
+				arc_release(db->db_buf, db);
+				dbuf_fix_old_data(db, tx->tx_txg);
+				data_old = db->db_buf;
+			}
+			ASSERT(data_old != NULL);
 		}
-		ASSERT(data_old != NULL);
 		dr->dt.dl.dr_data = data_old;
 	} else {
 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -1199,12 +1213,15 @@
 	}
 
 	if (db->db_level == 0) {
-		dbuf_unoverride(dr);
+		if (db->db_state != DB_NOFILL) {
+			dbuf_unoverride(dr);
 
-		ASSERT(db->db_buf != NULL);
-		ASSERT(dr->dt.dl.dr_data != NULL);
-		if (dr->dt.dl.dr_data != db->db_buf)
-			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+			ASSERT(db->db_buf != NULL);
+			ASSERT(dr->dt.dl.dr_data != NULL);
+			if (dr->dt.dl.dr_data != db->db_buf)
+				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
+				    db) == 1);
+		}
 	} else {
 		ASSERT(db->db_buf != NULL);
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -1246,6 +1263,16 @@
 }
 
 void
+dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	/* Tag the dbuf NOFILL, then take the regular dmu_buf_will_fill() path. */
+	db->db_state = DB_NOFILL;
+
+	dmu_buf_will_fill(db_fake, tx);
+}
+
+void
 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
@@ -1320,7 +1347,7 @@
 		db->db_state = DB_UNCACHED;
 	}
 
-	ASSERT3U(db->db_state, ==, DB_UNCACHED);
+	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
 	ASSERT(db->db_data_pending == NULL);
 
 	db->db_state = DB_EVICTING;
@@ -1745,7 +1772,8 @@
 			 * This is a special case: we never associated this
 			 * dbuf with any data allocated from the ARC.
 			 */
-			ASSERT3U(db->db_state, ==, DB_UNCACHED);
+			ASSERT(db->db_state == DB_UNCACHED ||
+			    db->db_state == DB_NOFILL);
 			dbuf_evict(db);
 		} else if (arc_released(db->db_buf)) {
 			arc_buf_t *buf = db->db_buf;
@@ -1933,7 +1961,7 @@
 		/* This buffer was freed and is now being re-filled */
 		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
 	} else {
-		ASSERT3U(db->db_state, ==, DB_CACHED);
+		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
 	}
 	DBUF_VERIFY(db);
 
@@ -2021,26 +2049,33 @@
 		return;
 	}
 
-	blksz = arc_buf_size(*datap);
+	if (db->db_state != DB_NOFILL) {
+		blksz = arc_buf_size(*datap);
 
-	if (dn->dn_object != DMU_META_DNODE_OBJECT) {
-		/*
-		 * If this buffer is currently "in use" (i.e., there are
-		 * active holds and db_data still references it), then make
-		 * a copy before we start the write so that any modifications
-		 * from the open txg will not leak into this write.
-		 *
-		 * NOTE: this copy does not need to be made for objects only
-		 * modified in the syncing context (e.g. DNONE_DNODE blocks).
-		 */
-		if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
-			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-			*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
-			bcopy(db->db.db_data, (*datap)->b_data, blksz);
+		if (dn->dn_object != DMU_META_DNODE_OBJECT) {
+			/*
+			 * If this buffer is currently "in use" (i.e., there
+			 * are active holds and db_data still references it),
+			 * then make a copy before we start the write so that
+			 * any modifications from the open txg will not leak
+			 * into this write.
+			 *
+			 * NOTE: this copy does not need to be made for
+			 * objects only modified in the syncing context (e.g.
+			 * DMU_META_DNODE_OBJECT blocks).
+			 */
+			if (refcount_count(&db->db_holds) > 1 &&
+			    *datap == db->db_buf) {
+				arc_buf_contents_t type =
+				    DBUF_GET_BUFC_TYPE(db);
+				*datap =
+				    arc_buf_alloc(os->os_spa, blksz, db, type);
+				bcopy(db->db.db_data, (*datap)->b_data, blksz);
+			}
 		}
+
+		ASSERT(*datap != NULL);
 	}
-
-	ASSERT(*datap != NULL);
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
@@ -2101,7 +2136,7 @@
 		 * overhead of making multiple copies of the data.
 		 */
 		arc_release(data, db);
-	} else {
+	} else if (db->db_state != DB_NOFILL) {
 		ASSERT(arc_released(data));
 		/* XXX why do we need to thaw here? */
 		arc_buf_thaw(data);
@@ -2140,10 +2175,40 @@
 		(void) dsl_dataset_block_kill(
 		    os->os_dsl_dataset, db->db_blkptr, zio, tx);
 
-	dr->dr_zio = arc_write(zio, os->os_spa, &wp,
-	    DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
-	    data, dbuf_write_ready, dbuf_write_done, db,
-	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+	if (db->db_state == DB_NOFILL) {
+		zio_prop_t zp = { 0 };
+
+		write_policy(os->os_spa, &wp, &zp);
+		dr->dr_zio = zio_write(zio, os->os_spa,
+		    txg, db->db_blkptr, NULL,
+		    db->db.db_size, &zp, dbuf_skip_write_ready,
+		    dbuf_skip_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
+		    ZIO_FLAG_MUSTSUCCEED, &zb);
+	} else {
+		dr->dr_zio = arc_write(zio, os->os_spa, &wp,
+		    DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
+		    data, dbuf_write_ready, dbuf_write_done, db,
+		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+	}
+}
+
+/* ready callback for the DB_NOFILL zio_write(); passes a NULL arc buf */
+static void
+dbuf_skip_write_ready(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	/* Gang bps are left alone: zio_skip_write() asserts !BP_IS_GANG. */
+	if (!BP_IS_GANG(bp))
+		zio_skip_write(zio);
+
+	dbuf_write_ready(zio, NULL, zio->io_private);
+}
+
+/* done callback for the DB_NOFILL zio_write(); passes a NULL arc buf */
+static void
+dbuf_skip_write_done(zio_t *zio)
+{
+	dbuf_write_done(zio, NULL, zio->io_private);
 }
 
 /* ARGSUSED */
@@ -2251,12 +2316,15 @@
 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 
-		if (dr->dt.dl.dr_data != db->db_buf)
-			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
-		else if (!BP_IS_HOLE(db->db_blkptr))
-			arc_set_callback(db->db_buf, dbuf_do_evict, db);
-		else
-			ASSERT(arc_released(db->db_buf));
+		if (db->db_state != DB_NOFILL) {
+			if (dr->dt.dl.dr_data != db->db_buf)
+				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
+				    db) == 1);
+			else if (!BP_IS_HOLE(db->db_blkptr))
+				arc_set_callback(db->db_buf, dbuf_do_evict, db);
+			else
+				ASSERT(arc_released(db->db_buf));
+		}
 	} else {
 		dnode_t *dn = db->db_dnode;
 
--- a/usr/src/uts/common/fs/zfs/dmu.c	Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Fri Oct 17 16:50:52 2008 -0600
@@ -638,6 +638,27 @@
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
+void
+dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	int numbufs, i;
+
+	if (size == 0)
+		return;
+	/* Hold the dbufs for this range; a nonzero return trips the VERIFY. */
+	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
+	    FALSE, FTAG, &numbufs, &dbp));
+
+	for (i = 0; i < numbufs; i++) {
+		dmu_buf_t *db = dbp[i];
+		/* No data is copied in; the dbuf is only tagged NOFILL. */
+		dmu_buf_will_not_fill(db, tx);
+	}
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
 #ifdef _KERNEL
 int
 dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
@@ -991,7 +1012,6 @@
 	zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db),
 	    txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in,
 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
-
 	if (pio) {
 		zio_nowait(zio);
 		err = EINPROGRESS;
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c	Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c	Fri Oct 17 16:50:52 2008 -0600
@@ -177,7 +177,6 @@
 	min_ibs = DN_MIN_INDBLKSHIFT;
 	max_ibs = DN_MAX_INDBLKSHIFT;
 
-
 	/*
 	 * For i/o error checking, read the first and last level-0
 	 * blocks (if they are not aligned), and all the level-1 blocks.
@@ -185,9 +184,12 @@
 
 	if (dn) {
 		if (dn->dn_maxblkid == 0) {
-			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
-			if (err)
-				goto out;
+			if ((off > 0 || len < dn->dn_datablksz) &&
+			    off < dn->dn_datablksz) {
+				err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+				if (err)
+					goto out;
+			}
 		} else {
 			zio_t *zio = zio_root(dn->dn_objset->os_spa,
 			    NULL, NULL, ZIO_FLAG_CANFAIL);
@@ -203,7 +205,7 @@
 
 			/* last level-0 block */
 			end = (off+len-1) >> dn->dn_datablkshift;
-			if (end != start &&
+			if (end != start && end <= dn->dn_maxblkid &&
 			    P2PHASE(off+len, dn->dn_datablksz)) {
 				err = dmu_tx_check_ioerr(zio, dn, 0, end);
 				if (err)
--- a/usr/src/uts/common/fs/zfs/sys/arc.h	Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h	Fri Oct 17 16:50:52 2008 -0600
@@ -94,6 +94,7 @@
 	uint8_t wp_dnchecksum, wp_oschecksum;
 } writeprops_t;
 
+void write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp);
 int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
     arc_done_func_t *done, void *private, int priority, int zio_flags,
     uint32_t *arc_flags, const zbookmark_t *zb);
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h	Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h	Fri Oct 17 16:50:52 2008 -0600
@@ -26,8 +26,6 @@
 #ifndef	_SYS_DBUF_H
 #define	_SYS_DBUF_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
@@ -55,19 +53,23 @@
 #define	DB_RF_CACHED		(1 << 5)
 
 /*
- * The state transition diagram for dbufs looks like:
+ * The simplified state transition diagram for dbufs looks like:
  *
  *		+----> READ ----+
  *		|		|
  *		|		V
  *  (alloc)-->UNCACHED	     CACHED-->EVICTING-->(free)
- *		|		^
- *		|		|
- *		+----> FILL ----+
+ *		|		^	 ^
+ *		|		|	 |
+ *		+----> FILL ----+	 |
+ *		|			 |
+ *		|			 |
+ *		+--------> NOFILL -------+
  */
 typedef enum dbuf_states {
 	DB_UNCACHED,
 	DB_FILL,
+	DB_NOFILL,
 	DB_READ,
 	DB_CACHED,
 	DB_EVICTING
@@ -258,8 +260,8 @@
 
 int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
 void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Fri Oct 17 16:50:52 2008 -0600
@@ -26,8 +26,6 @@
 #ifndef	_SYS_DMU_H
 #define	_SYS_DMU_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * This file describes the interface that the DMU provides for its
  * consumers.
@@ -451,6 +449,8 @@
 	void *buf);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	const void *buf, dmu_tx_t *tx);
+void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+	dmu_tx_t *tx);
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
 int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Fri Oct 17 16:50:52 2008 -0600
@@ -337,6 +337,8 @@
     void *data, uint64_t size, zio_done_func_t *done, void *private,
     int priority, int flags, zbookmark_t *zb);
 
+extern void zio_skip_write(zio_t *zio);
+
 extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     zio_done_func_t *done, void *private, int flags);
 
--- a/usr/src/uts/common/fs/zfs/zio.c	Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zio.c	Fri Oct 17 16:50:52 2008 -0600
@@ -512,6 +512,16 @@
 	return (zio);
 }
 
+void
+zio_skip_write(zio_t *zio)
+{
+	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	ASSERT(zio->io_stage == ZIO_STAGE_READY);
+	ASSERT(!BP_IS_GANG(zio->io_bp));
+	/* Clear the vdev I/O stage bits from this zio's pipeline mask. */
+	zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+}
+
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     void *data, uint64_t size, zio_prop_t *zp,
--- a/usr/src/uts/common/fs/zfs/zvol.c	Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zvol.c	Fri Oct 17 16:50:52 2008 -0600
@@ -623,7 +623,6 @@
 {
 	objset_t *os = zv->zv_objset;
 	dmu_tx_t *tx;
-	void *data;
 	uint64_t refd, avail, usedobjs, availobjs;
 	uint64_t resid = zv->zv_volsize;
 	uint64_t off = 0;
@@ -636,9 +635,6 @@
 	/* Free old extents if they exist */
 	zvol_free_extents(zv);
 
-	/* allocate the blocks by writing each one */
-	data = kmem_zalloc(SPA_MAXBLOCKSIZE, KM_SLEEP);
-
 	while (resid != 0) {
 		int error;
 		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
@@ -648,16 +644,14 @@
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
-			kmem_free(data, SPA_MAXBLOCKSIZE);
 			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
 			return (error);
 		}
-		dmu_write(os, ZVOL_OBJ, off, bytes, data, tx);
+		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
 		dmu_tx_commit(tx);
 		off += bytes;
 		resid -= bytes;
 	}
-	kmem_free(data, SPA_MAXBLOCKSIZE);
 	txg_wait_synced(dmu_objset_pool(os), 0);
 
 	return (0);