6490569 verify arc bufs are not modified when they shouldn't be
authorahrens
Sat, 11 Nov 2006 09:04:24 -0800
changeset 3093 71525e4187d5
parent 3092 959eb57579ac
child 3094 497bbbb1a2e6
6490569 verify arc bufs are not modified when they shouldn't be
usr/src/uts/common/fs/zfs/arc.c
usr/src/uts/common/fs/zfs/dbuf.c
usr/src/uts/common/fs/zfs/dmu_send.c
usr/src/uts/common/fs/zfs/dnode_sync.c
usr/src/uts/common/fs/zfs/sys/arc.h
usr/src/uts/common/fs/zfs/sys/dbuf.h
usr/src/uts/common/fs/zfs/sys/spa.h
usr/src/uts/common/fs/zfs/sys/zfs_debug.h
usr/src/uts/common/fs/zfs/zio_checksum.c
--- a/usr/src/uts/common/fs/zfs/arc.c	Fri Nov 10 20:26:18 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/arc.c	Sat Nov 11 09:04:24 2006 -0800
@@ -113,6 +113,7 @@
 
 #include <sys/spa.h>
 #include <sys/zio.h>
+#include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/refcount.h>
@@ -239,6 +240,9 @@
 	uint64_t		b_birth;
 	uint64_t		b_cksum0;
 
+	kmutex_t		b_freeze_lock;
+	zio_cksum_t		*b_freeze_cksum;
+
 	arc_buf_hdr_t		*b_hash_next;
 	arc_buf_t		*b_buf;
 	uint32_t		b_flags;
@@ -542,6 +546,69 @@
 #define	ARC_MINTIME	(hz>>4) /* 62 ms */
 
 static void
+arc_cksum_verify(arc_buf_t *buf)
+{
+	zio_cksum_t zc;
+
+	if (!zfs_flags & ZFS_DEBUG_MODIFY)
+		return;
+
+	mutex_enter(&buf->b_hdr->b_freeze_lock);
+	if (buf->b_hdr->b_freeze_cksum == NULL) {
+		mutex_exit(&buf->b_hdr->b_freeze_lock);
+		return;
+	}
+	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
+	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
+		panic("buffer modified while frozen!");
+	mutex_exit(&buf->b_hdr->b_freeze_lock);
+}
+
+static void
+arc_cksum_compute(arc_buf_t *buf)
+{
+	if (!zfs_flags & ZFS_DEBUG_MODIFY)
+		return;
+
+	mutex_enter(&buf->b_hdr->b_freeze_lock);
+	if (buf->b_hdr->b_freeze_cksum != NULL) {
+		mutex_exit(&buf->b_hdr->b_freeze_lock);
+		return;
+	}
+	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
+	    buf->b_hdr->b_freeze_cksum);
+	mutex_exit(&buf->b_hdr->b_freeze_lock);
+}
+
+void
+arc_buf_thaw(arc_buf_t *buf)
+{
+	if (!zfs_flags & ZFS_DEBUG_MODIFY)
+		return;
+
+	if (buf->b_hdr->b_state != arc.anon)
+		panic("modifying non-anon buffer!");
+	if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
+		panic("modifying buffer while i/o in progress!");
+	arc_cksum_verify(buf);
+	mutex_enter(&buf->b_hdr->b_freeze_lock);
+	if (buf->b_hdr->b_freeze_cksum != NULL) {
+		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+		buf->b_hdr->b_freeze_cksum = NULL;
+	}
+	mutex_exit(&buf->b_hdr->b_freeze_lock);
+}
+
+void
+arc_buf_freeze(arc_buf_t *buf)
+{
+	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
+	    buf->b_hdr->b_state == arc.anon);
+	arc_cksum_compute(buf);
+}
+
+static void
 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 {
 	ASSERT(MUTEX_HELD(hash_lock));
@@ -772,6 +839,7 @@
 		arc_state_t *state = buf->b_hdr->b_state;
 		uint64_t size = buf->b_hdr->b_size;
 
+		arc_cksum_verify(buf);
 		if (!recycle) {
 			zio_buf_free(buf->b_data, size);
 			atomic_add_64(&arc.size, -size);
@@ -834,6 +902,10 @@
 			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
 		}
 	}
+	if (hdr->b_freeze_cksum != NULL) {
+		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+		hdr->b_freeze_cksum = NULL;
+	}
 
 	ASSERT(!list_link_active(&hdr->b_arc_node));
 	ASSERT3P(hdr->b_hash_next, ==, NULL);
@@ -1671,7 +1743,7 @@
 	 * read.
 	 */
 	found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
-		    &hash_lock);
+	    &hash_lock);
 
 	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
 	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
@@ -1682,6 +1754,8 @@
 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
 		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
 
+	arc_cksum_compute(buf);
+
 	/* create copies of the data buffer for the callers */
 	abuf = buf;
 	for (acb = callback_list; acb; acb = acb->acb_next) {
@@ -2106,6 +2180,7 @@
 		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
 		ASSERT(BUF_EMPTY(hdr));
 		ASSERT(buf->b_efunc == NULL);
+		arc_buf_thaw(buf);
 		return;
 	}
 
@@ -2149,6 +2224,9 @@
 		nhdr->b_arc_access = 0;
 		nhdr->b_flags = 0;
 		nhdr->b_datacnt = 1;
+		nhdr->b_freeze_cksum =
+		    kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+		*nhdr->b_freeze_cksum = *hdr->b_freeze_cksum; /* struct copy */
 		buf->b_hdr = nhdr;
 		buf->b_next = NULL;
 		(void) refcount_add(&nhdr->b_refcnt, tag);
@@ -2168,6 +2246,7 @@
 	}
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
+	arc_buf_thaw(buf);
 }
 
 int
@@ -2219,6 +2298,8 @@
 		arc_buf_hdr_t *exists;
 		kmutex_t *hash_lock;
 
+		arc_cksum_verify(buf);
+
 		exists = buf_hash_insert(hdr, &hash_lock);
 		if (exists) {
 			/*
@@ -2287,6 +2368,7 @@
 	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
 	hdr->b_acb = acb;
 	hdr->b_flags |= ARC_IO_IN_PROGRESS;
+	arc_cksum_compute(buf);
 	rzio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
 	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);
 
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Fri Nov 10 20:26:18 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Sat Nov 11 09:04:24 2006 -0800
@@ -448,6 +448,7 @@
 		/* we were freed in flight; disregard any error */
 		arc_release(buf, db);
 		bzero(buf->b_data, db->db.db_size);
+		arc_buf_freeze(buf);
 		db->db_d.db_freed_in_flight = FALSE;
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
@@ -647,8 +648,8 @@
 	ASSERT(db->db.db_data != NULL);
 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 
-	quiescing = (arc_buf_t **)&db->db_d.db_data_old[(txg-1)&TXG_MASK];
-	syncing = (arc_buf_t **)&db->db_d.db_data_old[(txg-2)&TXG_MASK];
+	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
+	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
 
 	/*
 	 * If this buffer is referenced from the current quiescing
@@ -701,7 +702,7 @@
 static void
 dbuf_fix_old_bonus_data(dmu_buf_impl_t *db, uint64_t txg)
 {
-	void **quiescing, **syncing;
+	arc_buf_t **quiescing, **syncing;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db.db_data != NULL);
@@ -742,7 +743,14 @@
 		kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK],
 		    sizeof (blkptr_t));
 		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
-		/* release the already-written buffer */
+		/*
+		 * Release the already-written buffer, so we leave it in
+		 * a consistent dirty state.  Note that all callers are
+		 * modifying the buffer, so they will immediately do
+		 * another (redundant) arc_release().  Therefore, leave
+		 * the buf thawed to save the effort of freezing &
+		 * immediately re-thawing it.
+		 */
 		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
 	}
 }
@@ -811,6 +819,7 @@
 			ASSERT(db->db.db_data != NULL);
 			arc_release(db->db_buf, db);
 			bzero(db->db.db_data, db->db.db_size);
+			arc_buf_freeze(db->db_buf);
 		}
 
 		mutex_exit(&db->db_mtx);
@@ -959,6 +968,10 @@
 	 * If this buffer is already dirty, we're done.
 	 */
 	if (list_link_active(&db->db_dirty_node[txgoff])) {
+		if (db->db_blkid != DB_BONUS_BLKID && db->db_level == 0 &&
+		    db->db.db_object != DMU_META_DNODE_OBJECT)
+			arc_buf_thaw(db->db_buf);
+
 		mutex_exit(&db->db_mtx);
 		return;
 	}
@@ -1647,6 +1660,9 @@
 	holds = refcount_remove(&db->db_holds, tag);
 	ASSERT(holds >= 0);
 
+	if (db->db_buf && holds == db->db_dirtycnt)
+		arc_buf_freeze(db->db_buf);
+
 	if (holds == db->db_dirtycnt &&
 	    db->db_level == 0 && db->db_d.db_immediate_evict)
 		dbuf_evict_user(db);
@@ -1662,7 +1678,7 @@
 			 */
 			ASSERT3U(db->db_state, ==, DB_UNCACHED);
 			dbuf_evict(db);
-		} else  if (arc_released(db->db_buf)) {
+		} else if (arc_released(db->db_buf)) {
 			arc_buf_t *buf = db->db_buf;
 			/*
 			 * This dbuf has anonymous data associated with it.
@@ -1777,7 +1793,7 @@
 	 */
 
 	if (db->db_blkid == DB_BONUS_BLKID) {
-		void **datap = &db->db_d.db_data_old[txg&TXG_MASK];
+		arc_buf_t **datap = &db->db_d.db_data_old[txg&TXG_MASK];
 		/*
 		 * Simply copy the bonus data into the dnode.  It will
 		 * be written out when the dnode is synced (and it will
@@ -1807,7 +1823,7 @@
 	}
 
 	if (db->db_level == 0) {
-		data = (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
+		data = &db->db_d.db_data_old[txg&TXG_MASK];
 		blksz = arc_buf_size(*data);
 
 		/*
@@ -1963,7 +1979,7 @@
 		 * We may have read this indirect block after we dirtied it,
 		 * so never released it from the cache.
 		 */
-		arc_release(parent->db_buf, db->db_parent);
+		arc_release(parent->db_buf, parent);
 
 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
 		    (db->db_blkid & ((1ULL << epbs) - 1));
@@ -1984,8 +2000,7 @@
 
 	if (db->db_level == 0 &&
 	    db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
-		arc_buf_t **old =
-		    (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
+		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
 		blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
 		int old_size = bp_get_dasize(os->os_spa, db->db_blkptr);
 		int new_size = bp_get_dasize(os->os_spa, *bpp);
@@ -2130,8 +2145,7 @@
 		db->db_dirtied = 0;
 
 	if (db->db_level == 0) {
-		arc_buf_t **old =
-		    (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
+		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
 
 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
 
--- a/usr/src/uts/common/fs/zfs/dmu_send.c	Fri Nov 10 20:26:18 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c	Sat Nov 11 09:04:24 2006 -0800
@@ -896,10 +896,7 @@
 			 * everything before the DRR_END record.
 			 */
 			if (drre.drr_checksum.zc_word[0] != 0 &&
-			    ((drre.drr_checksum.zc_word[0] - pzc.zc_word[0]) |
-			    (drre.drr_checksum.zc_word[1] - pzc.zc_word[1]) |
-			    (drre.drr_checksum.zc_word[2] - pzc.zc_word[2]) |
-			    (drre.drr_checksum.zc_word[3] - pzc.zc_word[3]))) {
+			    !ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pzc)) {
 				ra.err = ECKSUM;
 				goto out;
 			}
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c	Fri Nov 10 20:26:18 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c	Sat Nov 11 09:04:24 2006 -0800
@@ -62,6 +62,7 @@
 		    db->db.db_size);
 		bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
 		    sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr);
+		arc_buf_freeze(db->db_buf);
 	}
 
 	dn->dn_phys->dn_nlevels += 1;
@@ -164,8 +165,7 @@
 
 		/* db_data_old better be zeroed */
 		if (child->db_d.db_data_old[txg & TXG_MASK]) {
-			buf = ((arc_buf_t *)child->db_d.db_data_old
-			    [txg & TXG_MASK])->b_data;
+			buf = child->db_d.db_data_old[txg & TXG_MASK]->b_data;
 			for (j = 0; j < child->db.db_size >> 3; j++) {
 				if (buf[j] != 0) {
 					panic("freed data not zero: "
@@ -238,6 +238,7 @@
 	if (db->db_level == 1) {
 		FREE_VERIFY(db, start, end, tx);
 		free_blocks(dn, bp, end-start+1, tx);
+		arc_buf_freeze(db->db_buf);
 		ASSERT(all || list_link_active(&db->db_dirty_node[txgoff]));
 		return (all);
 	}
@@ -258,6 +259,7 @@
 		}
 		dbuf_rele(subdb, FTAG);
 	}
+	arc_buf_freeze(db->db_buf);
 #ifdef ZFS_DEBUG
 	bp -= (end-start)+1;
 	for (i = start; i <= end; i++, bp++) {
--- a/usr/src/uts/common/fs/zfs/sys/arc.h	Fri Nov 10 20:26:18 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h	Sat Nov 11 09:04:24 2006 -0800
@@ -69,6 +69,8 @@
 void arc_release(arc_buf_t *buf, void *tag);
 int arc_released(arc_buf_t *buf);
 int arc_has_callback(arc_buf_t *buf);
+void arc_buf_freeze(arc_buf_t *buf);
+void arc_buf_thaw(arc_buf_t *buf);
 #ifdef ZFS_DEBUG
 int arc_referenced(arc_buf_t *buf);
 #endif
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h	Fri Nov 10 20:26:18 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h	Sat Nov 11 09:04:24 2006 -0800
@@ -196,7 +196,7 @@
 		 * modify (dirty or clean). db_mtx must be held
 		 * before dn_dirty_mtx.
 		 */
-		void *db_data_old[TXG_SIZE];
+		arc_buf_t *db_data_old[TXG_SIZE];
 		blkptr_t *db_overridden_by[TXG_SIZE];
 	} db_d;
 } dmu_buf_impl_t;
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Fri Nov 10 20:26:18 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Sat Nov 11 09:04:24 2006 -0800
@@ -252,6 +252,13 @@
 	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
 	(dva1)->dva_word[0] == (dva2)->dva_word[0])
 
+#define	ZIO_CHECKSUM_EQUAL(zc1, zc2) \
+	(0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
+	((zc1).zc_word[1] - (zc2).zc_word[1]) | \
+	((zc1).zc_word[2] - (zc2).zc_word[2]) | \
+	((zc1).zc_word[3] - (zc2).zc_word[3])))
+
+
 #define	DVA_IS_VALID(dva)	(DVA_GET_ASIZE(dva) != 0)
 
 #define	ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3)	\
--- a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h	Fri Nov 10 20:26:18 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h	Sat Nov 11 09:04:24 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -55,6 +54,7 @@
 #define	ZFS_DEBUG_DBUF_VERIFY	0x0002
 #define	ZFS_DEBUG_DNODE_VERIFY	0x0004
 #define	ZFS_DEBUG_SNAPNAMES	0x0008
+#define	ZFS_DEBUG_MODIFY	0x0010
 
 #ifdef ZFS_DEBUG
 extern void __dprintf(const char *file, const char *func,
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c	Fri Nov 10 20:26:18 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c	Sat Nov 11 09:04:24 2006 -0800
@@ -162,10 +162,7 @@
 		ci->ci_func[byteswap](data, size, &actual_cksum);
 	}
 
-	if ((actual_cksum.zc_word[0] - zc.zc_word[0]) |
-	    (actual_cksum.zc_word[1] - zc.zc_word[1]) |
-	    (actual_cksum.zc_word[2] - zc.zc_word[2]) |
-	    (actual_cksum.zc_word[3] - zc.zc_word[3]))
+	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc))
 		return (ECKSUM);
 
 	if (zio_injection_enabled && !zio->io_error)