6907006 zfs over-the-wire dedup recalculates checksums unnecessarily
authorLori Alt <Lori.Alt@Sun.COM>
Tue, 22 Dec 2009 11:52:16 -0700
changeset 11381 c77e4d2b2e75
parent 11380 73d454719071
child 11382 846b8e787e23
6907006 zfs over-the-wire dedup recalculates checksums unnecessarily
usr/src/cmd/zstreamdump/zstreamdump.c
usr/src/lib/libzfs/common/libzfs_sendrecv.c
usr/src/uts/common/fs/zfs/dmu_send.c
usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
--- a/usr/src/cmd/zstreamdump/zstreamdump.c	Tue Dec 22 17:10:31 2009 +0100
+++ b/usr/src/cmd/zstreamdump/zstreamdump.c	Tue Dec 22 11:52:16 2009 -0700
@@ -91,7 +91,7 @@
 	char c;
 	boolean_t verbose = B_FALSE;
 	boolean_t first = B_TRUE;
-	int i, err;
+	int err;
 	zio_cksum_t zc = { 0 };
 	zio_cksum_t pcksum = { 0 };
 
@@ -308,14 +308,20 @@
 				drrw->drr_offset = BSWAP_64(drrw->drr_offset);
 				drrw->drr_length = BSWAP_64(drrw->drr_length);
 				drrw->drr_toguid = BSWAP_64(drrw->drr_toguid);
+				drrw->drr_key.ddk_prop =
+				    BSWAP_64(drrw->drr_key.ddk_prop);
 			}
 			if (verbose) {
 				(void) printf("WRITE object = %llu type = %u "
-				    "offset = %llu length = %llu\n",
+				    "checksum type = %u\n"
+				    "offset = %llu length = %llu "
+				    "props = %llx\n",
 				    (u_longlong_t)drrw->drr_object,
 				    drrw->drr_type,
+				    drrw->drr_checksumtype,
 				    (u_longlong_t)drrw->drr_offset,
-				    (u_longlong_t)drrw->drr_length);
+				    (u_longlong_t)drrw->drr_length,
+				    (u_longlong_t)drrw->drr_key.ddk_prop);
 			}
 			(void) ssread(buf, drrw->drr_length, &zc);
 			total_write_size += drrw->drr_length;
@@ -337,13 +343,18 @@
 				    BSWAP_64(drrwbr->drr_refobject);
 				drrwbr->drr_refoffset =
 				    BSWAP_64(drrwbr->drr_refoffset);
+				drrwbr->drr_key.ddk_prop =
+				    BSWAP_64(drrwbr->drr_key.ddk_prop);
 			}
 			if (verbose) {
 				(void) printf("WRITE_BYREF object = %llu "
+				    "checksum type = %u props = %llx\n"
 				    "offset = %llu length = %llu\n"
 				    "toguid = %llx refguid = %llx\n"
 				    "refobject = %llu refoffset = %llu\n",
 				    (u_longlong_t)drrwbr->drr_object,
+				    drrwbr->drr_checksumtype,
+				    (u_longlong_t)drrwbr->drr_key.ddk_prop,
 				    (u_longlong_t)drrwbr->drr_offset,
 				    (u_longlong_t)drrwbr->drr_length,
 				    (u_longlong_t)drrwbr->drr_toguid,
--- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c	Tue Dec 22 17:10:31 2009 +0100
+++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c	Tue Dec 22 11:52:16 2009 -0700
@@ -45,6 +45,8 @@
 #include "zfs_fletcher.h"
 #include "libzfs_impl.h"
 #include <sha2.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
 
 /* in libzfs_dataset.c */
 extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
@@ -69,6 +71,7 @@
 typedef struct dedup_entry {
 	struct dedup_entry	*dde_next;
 	zio_cksum_t dde_chksum;
+	uint64_t dde_prop;
 	dataref_t dde_ref;
 } dedup_entry_t;
 
@@ -108,7 +111,7 @@
 
 static void
 ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp,
-    zio_cksum_t *cs, dataref_t *dr)
+    zio_cksum_t *cs, uint64_t prop, dataref_t *dr)
 {
 	dedup_entry_t	*dde;
 
@@ -127,6 +130,7 @@
 		assert(*ddepp == NULL);
 		dde->dde_next = NULL;
 		dde->dde_chksum = *cs;
+		dde->dde_prop = prop;
 		dde->dde_ref = *dr;
 		*ddepp = dde;
 		ddt->cur_ddt_size += sizeof (dedup_entry_t);
@@ -145,7 +149,7 @@
  */
 static boolean_t
 ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs,
-    dataref_t *dr)
+    uint64_t prop, dataref_t *dr)
 {
 	uint32_t hashcode;
 	dedup_entry_t **ddepp;
@@ -154,12 +158,13 @@
 
 	for (ddepp = &(ddt->dedup_hash_array[hashcode]); *ddepp != NULL;
 	    ddepp = &((*ddepp)->dde_next)) {
-		if (ZIO_CHECKSUM_EQUAL(((*ddepp)->dde_chksum), *cs)) {
+		if (ZIO_CHECKSUM_EQUAL(((*ddepp)->dde_chksum), *cs) &&
+		    (*ddepp)->dde_prop == prop) {
 			*dr = (*ddepp)->dde_ref;
 			return (B_TRUE);
 		}
 	}
-	ddt_hash_append(hdl, ddt, ddepp, cs, dr);
+	ddt_hash_append(hdl, ddt, ddepp, cs, prop, dr);
 	return (B_FALSE);
 }
 
@@ -200,7 +205,7 @@
 	struct drr_write *drrw = &thedrr.drr_u.drr_write;
 	FILE *ofp;
 	int outfd;
-	dmu_replay_record_t wbr_drr;
+	dmu_replay_record_t wbr_drr = {0};
 	struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref;
 	dedup_table_t ddt;
 	zio_cksum_t stream_cksum;
@@ -243,7 +248,8 @@
 
 			/* set the DEDUP feature flag for this stream */
 			fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
-			fflags |= DMU_BACKUP_FEATURE_DEDUP;
+			fflags |= (DMU_BACKUP_FEATURE_DEDUP |
+			    DMU_BACKUP_FEATURE_DEDUPPROPS);
 			DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
 
 			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
@@ -309,26 +315,31 @@
 			dataref_t	dataref;
 
 			(void) ssread(buf, drrw->drr_length, ofp);
+
 			/*
-			 * If the block doesn't already have a dedup
-			 * checksum, calculate one.
+			 * Use the existing checksum if it's dedup-capable,
+			 * else calculate a SHA256 checksum for it.
 			 */
-			if (ZIO_CHECKSUM_EQUAL(drrw->drr_blkcksum,
-			    zero_cksum)) {
+
+			if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum,
+			    zero_cksum) ||
+			    !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) {
 				SHA256_CTX	ctx;
 				zio_cksum_t	tmpsha256;
 
 				SHA256Init(&ctx);
 				SHA256Update(&ctx, buf, drrw->drr_length);
 				SHA256Final(&tmpsha256, &ctx);
-				drrw->drr_blkcksum.zc_word[0] =
+				drrw->drr_key.ddk_cksum.zc_word[0] =
 				    BE_64(tmpsha256.zc_word[0]);
-				drrw->drr_blkcksum.zc_word[1] =
+				drrw->drr_key.ddk_cksum.zc_word[1] =
 				    BE_64(tmpsha256.zc_word[1]);
-				drrw->drr_blkcksum.zc_word[2] =
+				drrw->drr_key.ddk_cksum.zc_word[2] =
 				    BE_64(tmpsha256.zc_word[2]);
-				drrw->drr_blkcksum.zc_word[3] =
+				drrw->drr_key.ddk_cksum.zc_word[3] =
 				    BE_64(tmpsha256.zc_word[3]);
+				drrw->drr_checksumtype = ZIO_CHECKSUM_SHA256;
+				drrw->drr_checksumflags = DRR_CHECKSUM_DEDUP;
 			}
 
 			dataref.ref_guid = drrw->drr_toguid;
@@ -336,7 +347,8 @@
 			dataref.ref_offset = drrw->drr_offset;
 
 			if (ddt_update(dda->dedup_hdl, &ddt,
-			    &drrw->drr_blkcksum, &dataref)) {
+			    &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop,
+			    &dataref)) {
 				/* block already present in stream */
 				wbr_drrr->drr_object = drrw->drr_object;
 				wbr_drrr->drr_offset = drrw->drr_offset;
@@ -348,7 +360,14 @@
 				wbr_drrr->drr_refoffset =
 				    dataref.ref_offset;
 
-				wbr_drrr->drr_blkcksum = drrw->drr_blkcksum;
+				wbr_drrr->drr_checksumtype =
+				    drrw->drr_checksumtype;
+				wbr_drrr->drr_checksumflags =
+				    drrw->drr_checksumtype;
+				wbr_drrr->drr_key.ddk_cksum =
+				    drrw->drr_key.ddk_cksum;
+				wbr_drrr->drr_key.ddk_prop =
+				    drrw->drr_key.ddk_prop;
 
 				if (cksum_and_write(&wbr_drr,
 				    sizeof (dmu_replay_record_t), &stream_cksum,
@@ -1137,7 +1156,8 @@
 		holdsnaps = B_TRUE;
 
 	if (flags.dedup) {
-		featureflags |= DMU_BACKUP_FEATURE_DEDUP;
+		featureflags |= (DMU_BACKUP_FEATURE_DEDUP |
+		    DMU_BACKUP_FEATURE_DEDUPPROPS);
 		if (err = pipe(pipefd)) {
 			zfs_error_aux(zhp->zfs_hdl, strerror(errno));
 			return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED,
--- a/usr/src/uts/common/fs/zfs/dmu_send.c	Tue Dec 22 17:10:31 2009 +0100
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c	Tue Dec 22 11:52:16 2009 -0700
@@ -40,6 +40,7 @@
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/avl.h>
+#include <sys/ddt.h>
 
 static char *dmu_recv_tag = "dmu_recv_tag";
 
@@ -142,10 +143,11 @@
 
 static int
 dump_data(struct backuparg *ba, dmu_object_type_t type,
-    uint64_t object, uint64_t offset, int blksz, void *data)
+    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
 {
 	struct drr_write *drrw = &(ba->drr->drr_u.drr_write);
 
+
 	/*
 	 * If there is any kind of pending aggregation (currently either
 	 * a grouping of free objects or free blocks), push it out to
@@ -165,6 +167,13 @@
 	drrw->drr_offset = offset;
 	drrw->drr_length = blksz;
 	drrw->drr_toguid = ba->toguid;
+	drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+	if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
+		drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
+	DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+	DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+	DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+	drrw->drr_key.ddk_cksum = bp->blk_cksum;
 
 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
 		return (EINTR);
@@ -321,7 +330,7 @@
 			return (EIO);
 
 		err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz,
-		    blksz, abuf->b_data);
+		    blksz, bp, abuf->b_data);
 		(void) arc_buf_remove_ref(abuf, &abuf);
 	}
 
@@ -873,10 +882,11 @@
 		DO64(drr_write.drr_offset);
 		DO64(drr_write.drr_length);
 		DO64(drr_write.drr_toguid);
-		DO64(drr_write.drr_blkcksum.zc_word[0]);
-		DO64(drr_write.drr_blkcksum.zc_word[1]);
-		DO64(drr_write.drr_blkcksum.zc_word[2]);
-		DO64(drr_write.drr_blkcksum.zc_word[3]);
+		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
+		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
+		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
+		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
+		DO64(drr_write.drr_key.ddk_prop);
 		break;
 	case DRR_WRITE_BYREF:
 		DO64(drr_write_byref.drr_object);
@@ -886,10 +896,11 @@
 		DO64(drr_write_byref.drr_refguid);
 		DO64(drr_write_byref.drr_refobject);
 		DO64(drr_write_byref.drr_refoffset);
-		DO64(drr_write_byref.drr_blkcksum.zc_word[0]);
-		DO64(drr_write_byref.drr_blkcksum.zc_word[1]);
-		DO64(drr_write_byref.drr_blkcksum.zc_word[2]);
-		DO64(drr_write_byref.drr_blkcksum.zc_word[3]);
+		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
+		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
+		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
+		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
+		DO64(drr_write_byref.drr_key.ddk_prop);
 		break;
 	case DRR_FREE:
 		DO64(drr_free.drr_object);
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h	Tue Dec 22 17:10:31 2009 +0100
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h	Tue Dec 22 11:52:16 2009 -0700
@@ -70,11 +70,13 @@
  */
 
 #define	DMU_BACKUP_FEATURE_DEDUP	(0x1)
+#define	DMU_BACKUP_FEATURE_DEDUPPROPS	(0x2)
 
 /*
  * Mask of all supported backup features
  */
-#define	DMU_BACKUP_FEATURE_MASK	(DMU_BACKUP_FEATURE_DEDUP)
+#define	DMU_BACKUP_FEATURE_MASK	(DMU_BACKUP_FEATURE_DEDUP | \
+		DMU_BACKUP_FEATURE_DEDUPPROPS)
 
 /* Are all features in the given flag word currently supported? */
 #define	DMU_STREAM_SUPPORTED(x)	(!((x) & ~DMU_BACKUP_FEATURE_MASK))
@@ -102,6 +104,14 @@
 #define	DRR_FLAG_CI_DATA	(1<<1)
 
 /*
+ * flags in the drr_checksumflags field in the DRR_WRITE and
+ * DRR_WRITE_BYREF blocks
+ */
+#define	DRR_CHECKSUM_DEDUP	(1<<0)
+
+#define	DRR_IS_DEDUP_CAPABLE(flags)	((flags) & DRR_CHECKSUM_DEDUP)
+
+/*
  * zfs ioctl command structure
  */
 typedef struct dmu_replay_record {
@@ -151,8 +161,9 @@
 			uint64_t drr_length;
 			uint64_t drr_toguid;
 			uint8_t drr_checksumtype;
-			uint8_t drr_pad2[7];
-			zio_cksum_t drr_blkcksum;
+			uint8_t drr_checksumflags;
+			uint8_t drr_pad2[6];
+			ddt_key_t drr_key; /* deduplication key */
 			/* content follows */
 		} drr_write;
 		struct drr_free {
@@ -171,9 +182,11 @@
 			uint64_t drr_refguid;
 			uint64_t drr_refobject;
 			uint64_t drr_refoffset;
+			/* properties of the data */
 			uint8_t drr_checksumtype;
-			uint8_t drr_pad[7];
-			zio_cksum_t drr_blkcksum;
+			uint8_t drr_checksumflags;
+			uint8_t drr_pad2[6];
+			ddt_key_t drr_key; /* deduplication key */
 		} drr_write_byref;
 	} drr_u;
 } dmu_replay_record_t;