usr/src/uts/common/fs/zfs/dbuf.c
changeset 10922 e2081f502306
parent 10298 a0d52501437c
child 11539 10d35fc3d7fd
equal deleted inserted replaced
10921:8aac17999e4d 10922:e2081f502306
    36 #include <sys/dmu_zfetch.h>
    36 #include <sys/dmu_zfetch.h>
    37 
    37 
    38 static void dbuf_destroy(dmu_buf_impl_t *db);
    38 static void dbuf_destroy(dmu_buf_impl_t *db);
    39 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
    39 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
    40 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
    40 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
    41 static arc_done_func_t dbuf_write_ready;
       
    42 static arc_done_func_t dbuf_write_done;
       
    43 static zio_done_func_t dbuf_skip_write_ready;
       
    44 static zio_done_func_t dbuf_skip_write_done;
       
    45 
    41 
    46 /*
    42 /*
    47  * Global data structures and functions for the dbuf cache.
    43  * Global data structures and functions for the dbuf cache.
    48  */
    44  */
    49 static kmem_cache_t *dbuf_cache;
    45 static kmem_cache_t *dbuf_cache;
   283 #ifdef ZFS_DEBUG
   279 #ifdef ZFS_DEBUG
   284 static void
   280 static void
   285 dbuf_verify(dmu_buf_impl_t *db)
   281 dbuf_verify(dmu_buf_impl_t *db)
   286 {
   282 {
   287 	dnode_t *dn = db->db_dnode;
   283 	dnode_t *dn = db->db_dnode;
       
   284 	dbuf_dirty_record_t *dr;
   288 
   285 
   289 	ASSERT(MUTEX_HELD(&db->db_mtx));
   286 	ASSERT(MUTEX_HELD(&db->db_mtx));
   290 
   287 
   291 	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
   288 	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
   292 		return;
   289 		return;
   308 		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
   305 		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
   309 	} else {
   306 	} else {
   310 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
   307 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
   311 	}
   308 	}
   312 
   309 
       
   310 	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
       
   311 		ASSERT(dr->dr_dbuf == db);
       
   312 
       
   313 	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
       
   314 		ASSERT(dr->dr_dbuf == db);
       
   315 
   313 	/*
   316 	/*
   314 	 * We can't assert that db_size matches dn_datablksz because it
   317 	 * We can't assert that db_size matches dn_datablksz because it
   315 	 * can be momentarily different when another thread is doing
   318 	 * can be momentarily different when another thread is doing
   316 	 * dnode_set_blksz().
   319 	 * dnode_set_blksz().
   317 	 */
   320 	 */
   318 	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
   321 	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
   319 		dbuf_dirty_record_t *dr = db->db_data_pending;
   322 		dr = db->db_data_pending;
   320 		/*
   323 		/*
   321 		 * It should only be modified in syncing context, so
   324 		 * It should only be modified in syncing context, so
   322 		 * make sure we only have one copy of the data.
   325 		 * make sure we only have one copy of the data.
   323 		 */
   326 		 */
   324 		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
   327 		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
   503 	mutex_exit(&db->db_mtx);
   506 	mutex_exit(&db->db_mtx);
   504 
   507 
   505 	if (DBUF_IS_L2CACHEABLE(db))
   508 	if (DBUF_IS_L2CACHEABLE(db))
   506 		aflags |= ARC_L2CACHE;
   509 		aflags |= ARC_L2CACHE;
   507 
   510 
   508 	zb.zb_objset = db->db_objset->os_dsl_dataset ?
   511 	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
   509 	    db->db_objset->os_dsl_dataset->ds_object : 0;
   512 	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
   510 	zb.zb_object = db->db.db_object;
   513 	    db->db.db_object, db->db_level, db->db_blkid);
   511 	zb.zb_level = db->db_level;
       
   512 	zb.zb_blkid = db->db_blkid;
       
   513 
   514 
   514 	dbuf_add_ref(db, NULL);
   515 	dbuf_add_ref(db, NULL);
   515 	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
   516 	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
   516 
   517 
   517 	if (db->db_parent)
   518 	if (db->db_parent)
   680 
   681 
   681 void
   682 void
   682 dbuf_unoverride(dbuf_dirty_record_t *dr)
   683 dbuf_unoverride(dbuf_dirty_record_t *dr)
   683 {
   684 {
   684 	dmu_buf_impl_t *db = dr->dr_dbuf;
   685 	dmu_buf_impl_t *db = dr->dr_dbuf;
       
   686 	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
   685 	uint64_t txg = dr->dr_txg;
   687 	uint64_t txg = dr->dr_txg;
   686 
   688 
   687 	ASSERT(MUTEX_HELD(&db->db_mtx));
   689 	ASSERT(MUTEX_HELD(&db->db_mtx));
   688 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
   690 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
   689 	ASSERT(db->db_level == 0);
   691 	ASSERT(db->db_level == 0);
   690 
   692 
   691 	if (db->db_blkid == DB_BONUS_BLKID ||
   693 	if (db->db_blkid == DB_BONUS_BLKID ||
   692 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
   694 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
   693 		return;
   695 		return;
   694 
   696 
       
   697 	ASSERT(db->db_data_pending != dr);
       
   698 
   695 	/* free this block */
   699 	/* free this block */
   696 	if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
   700 	if (!BP_IS_HOLE(bp))
   697 		/* XXX can get silent EIO here */
   701 		dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp);
   698 		(void) dsl_free(NULL,
   702 
   699 		    spa_get_dsl(db->db_dnode->dn_objset->os_spa),
       
   700 		    txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
       
   701 	}
       
   702 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
   703 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
   703 	/*
   704 	/*
   704 	 * Release the already-written buffer, so we leave it in
   705 	 * Release the already-written buffer, so we leave it in
   705 	 * a consistent dirty state.  Note that all callers are
   706 	 * a consistent dirty state.  Note that all callers are
   706 	 * modifying the buffer, so they will immediately do
   707 	 * modifying the buffer, so they will immediately do
   959 			/*
   960 			/*
   960 			 * If this buffer has already been written out,
   961 			 * If this buffer has already been written out,
   961 			 * we now need to reset its state.
   962 			 * we now need to reset its state.
   962 			 */
   963 			 */
   963 			dbuf_unoverride(dr);
   964 			dbuf_unoverride(dr);
   964 			if (db->db.db_object != DMU_META_DNODE_OBJECT)
   965 			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
       
   966 			    db->db_state != DB_NOFILL)
   965 				arc_buf_thaw(db->db_buf);
   967 				arc_buf_thaw(db->db_buf);
   966 		}
   968 		}
   967 		mutex_exit(&db->db_mtx);
   969 		mutex_exit(&db->db_mtx);
   968 		return (dr);
   970 		return (dr);
   969 	}
   971 	}
   998 	if (db->db_blkid != DB_BONUS_BLKID) {
  1000 	if (db->db_blkid != DB_BONUS_BLKID) {
   999 		/*
  1001 		/*
  1000 		 * Update the accounting.
  1002 		 * Update the accounting.
  1001 		 * Note: we delay "free accounting" until after we drop
  1003 		 * Note: we delay "free accounting" until after we drop
  1002 		 * the db_mtx.  This keeps us from grabbing other locks
  1004 		 * the db_mtx.  This keeps us from grabbing other locks
  1003 		 * (and possibly deadlocking) in bp_get_dasize() while
  1005 		 * (and possibly deadlocking) in bp_get_dsize() while
  1004 		 * also holding the db_mtx.
  1006 		 * also holding the db_mtx.
  1005 		 */
  1007 		 */
  1006 		dnode_willuse_space(dn, db->db.db_size, tx);
  1008 		dnode_willuse_space(dn, db->db.db_size, tx);
  1007 		do_free_accounting = dbuf_block_freeable(db);
  1009 		do_free_accounting = dbuf_block_freeable(db);
  1008 	}
  1010 	}
  1077 		dnode_setdirty(dn, tx);
  1079 		dnode_setdirty(dn, tx);
  1078 		return (dr);
  1080 		return (dr);
  1079 	} else if (do_free_accounting) {
  1081 	} else if (do_free_accounting) {
  1080 		blkptr_t *bp = db->db_blkptr;
  1082 		blkptr_t *bp = db->db_blkptr;
  1081 		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
  1083 		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
  1082 		    bp_get_dasize(os->os_spa, bp) : db->db.db_size;
  1084 		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
  1083 		/*
  1085 		/*
  1084 		 * This is only a guess -- if the dbuf is dirty
  1086 		 * This is only a guess -- if the dbuf is dirty
  1085 		 * in a previous txg, we don't know how much
  1087 		 * in a previous txg, we don't know how much
  1086 		 * space it will use on disk yet.  We should
  1088 		 * space it will use on disk yet.  We should
  1087 		 * really have the struct_rwlock to access
  1089 		 * really have the struct_rwlock to access
  1170 	if (dr == NULL || dr->dr_txg < txg) {
  1172 	if (dr == NULL || dr->dr_txg < txg) {
  1171 		mutex_exit(&db->db_mtx);
  1173 		mutex_exit(&db->db_mtx);
  1172 		return (0);
  1174 		return (0);
  1173 	}
  1175 	}
  1174 	ASSERT(dr->dr_txg == txg);
  1176 	ASSERT(dr->dr_txg == txg);
       
  1177 	ASSERT(dr->dr_dbuf == db);
  1175 
  1178 
  1176 	/*
  1179 	/*
  1177 	 * If this buffer is currently held, we cannot undirty
  1180 	 * If this buffer is currently held, we cannot undirty
  1178 	 * it, since one of the current holders may be in the
  1181 	 * it, since one of the current holders may be in the
  1179 	 * middle of an update.  Note that users of dbuf_undirty()
  1182 	 * middle of an update.  Note that users of dbuf_undirty()
  1229 	db->db_dirtycnt -= 1;
  1232 	db->db_dirtycnt -= 1;
  1230 
  1233 
  1231 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
  1234 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
  1232 		arc_buf_t *buf = db->db_buf;
  1235 		arc_buf_t *buf = db->db_buf;
  1233 
  1236 
  1234 		ASSERT(arc_released(buf));
  1237 		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
  1235 		dbuf_set_data(db, NULL);
  1238 		dbuf_set_data(db, NULL);
  1236 		VERIFY(arc_buf_remove_ref(buf, db) == 1);
  1239 		VERIFY(arc_buf_remove_ref(buf, db) == 1);
  1237 		dbuf_evict(db);
  1240 		dbuf_evict(db);
  1238 		return (1);
  1241 		return (1);
  1239 	}
  1242 	}
  1647 	}
  1650 	}
  1648 
  1651 
  1649 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
  1652 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
  1650 		if (bp && !BP_IS_HOLE(bp)) {
  1653 		if (bp && !BP_IS_HOLE(bp)) {
  1651 			arc_buf_t *pbuf;
  1654 			arc_buf_t *pbuf;
       
  1655 			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
  1652 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
  1656 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
  1653 			zbookmark_t zb;
  1657 			zbookmark_t zb;
  1654 			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
  1658 
  1655 			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
  1659 			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
  1656 			zb.zb_object = dn->dn_object;
  1660 			    dn->dn_object, 0, blkid);
  1657 			zb.zb_level = 0;
       
  1658 			zb.zb_blkid = blkid;
       
  1659 
  1661 
  1660 			if (db)
  1662 			if (db)
  1661 				pbuf = db->db_buf;
  1663 				pbuf = db->db_buf;
  1662 			else
  1664 			else
  1663 				pbuf = dn->dn_objset->os_phys_buf;
  1665 				pbuf = dn->dn_objset->os_phys_buf;
  1799 
  1801 
  1800 #pragma weak dmu_buf_rele = dbuf_rele
  1802 #pragma weak dmu_buf_rele = dbuf_rele
  1801 void
  1803 void
  1802 dbuf_rele(dmu_buf_impl_t *db, void *tag)
  1804 dbuf_rele(dmu_buf_impl_t *db, void *tag)
  1803 {
  1805 {
       
  1806 	mutex_enter(&db->db_mtx);
       
  1807 	dbuf_rele_and_unlock(db, tag);
       
  1808 }
       
  1809 
       
  1810 /*
       
  1811  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
       
  1812  * db_dirtycnt and db_holds to be updated atomically.
       
  1813  */
       
  1814 void
       
  1815 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
       
  1816 {
  1804 	int64_t holds;
  1817 	int64_t holds;
  1805 
  1818 
  1806 	mutex_enter(&db->db_mtx);
  1819 	ASSERT(MUTEX_HELD(&db->db_mtx));
  1807 	DBUF_VERIFY(db);
  1820 	DBUF_VERIFY(db);
  1808 
  1821 
  1809 	holds = refcount_remove(&db->db_holds, tag);
  1822 	holds = refcount_remove(&db->db_holds, tag);
  1810 	ASSERT(holds >= 0);
  1823 	ASSERT(holds >= 0);
  1811 
  1824 
  2054 		db->db_data_pending = NULL;
  2067 		db->db_data_pending = NULL;
  2055 		drp = &db->db_last_dirty;
  2068 		drp = &db->db_last_dirty;
  2056 		while (*drp != dr)
  2069 		while (*drp != dr)
  2057 			drp = &(*drp)->dr_next;
  2070 			drp = &(*drp)->dr_next;
  2058 		ASSERT(dr->dr_next == NULL);
  2071 		ASSERT(dr->dr_next == NULL);
       
  2072 		ASSERT(dr->dr_dbuf == db);
  2059 		*drp = dr->dr_next;
  2073 		*drp = dr->dr_next;
  2060 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
  2074 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
  2061 		ASSERT(db->db_dirtycnt > 0);
  2075 		ASSERT(db->db_dirtycnt > 0);
  2062 		db->db_dirtycnt -= 1;
  2076 		db->db_dirtycnt -= 1;
  2063 		mutex_exit(&db->db_mtx);
  2077 		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
  2064 		dbuf_rele(db, (void *)(uintptr_t)txg);
       
  2065 		return;
  2078 		return;
  2066 	}
  2079 	}
  2067 
  2080 
  2068 	/*
  2081 	/*
  2069 	 * This function may have dropped the db_mtx lock allowing a dmu_sync
  2082 	 * This function may have dropped the db_mtx lock allowing a dmu_sync
  2081 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
  2094 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
  2082 		cv_wait(&db->db_changed, &db->db_mtx);
  2095 		cv_wait(&db->db_changed, &db->db_mtx);
  2083 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
  2096 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
  2084 	}
  2097 	}
  2085 
  2098 
  2086 	/*
       
  2087 	 * If this dbuf has already been written out via an immediate write,
       
  2088 	 * just complete the write by copying over the new block pointer and
       
  2089 	 * updating the accounting via the write-completion functions.
       
  2090 	 */
       
  2091 	if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
       
  2092 		zio_t zio_fake;
       
  2093 
       
  2094 		zio_fake.io_private = &db;
       
  2095 		zio_fake.io_error = 0;
       
  2096 		zio_fake.io_bp = db->db_blkptr;
       
  2097 		zio_fake.io_bp_orig = *db->db_blkptr;
       
  2098 		zio_fake.io_txg = txg;
       
  2099 		zio_fake.io_flags = 0;
       
  2100 
       
  2101 		*db->db_blkptr = dr->dt.dl.dr_overridden_by;
       
  2102 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
       
  2103 		db->db_data_pending = dr;
       
  2104 		dr->dr_zio = &zio_fake;
       
  2105 		mutex_exit(&db->db_mtx);
       
  2106 
       
  2107 		ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp),
       
  2108 		    BP_IDENTITY(&zio_fake.io_bp_orig)) ||
       
  2109 		    BP_IS_HOLE(zio_fake.io_bp));
       
  2110 
       
  2111 		if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
       
  2112 			(void) dsl_dataset_block_kill(os->os_dsl_dataset,
       
  2113 			    &zio_fake.io_bp_orig, dn->dn_zio, tx);
       
  2114 
       
  2115 		dbuf_write_ready(&zio_fake, db->db_buf, db);
       
  2116 		dbuf_write_done(&zio_fake, db->db_buf, db);
       
  2117 
       
  2118 		return;
       
  2119 	}
       
  2120 
       
  2121 	if (db->db_state != DB_NOFILL &&
  2099 	if (db->db_state != DB_NOFILL &&
  2122 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
  2100 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
  2123 	    refcount_count(&db->db_holds) > 1 &&
  2101 	    refcount_count(&db->db_holds) > 1 &&
       
  2102 	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
  2124 	    *datap == db->db_buf) {
  2103 	    *datap == db->db_buf) {
  2125 		/*
  2104 		/*
  2126 		 * If this buffer is currently "in use" (i.e., there
  2105 		 * If this buffer is currently "in use" (i.e., there
  2127 		 * are active holds and db_data still references it),
  2106 		 * are active holds and db_data still references it),
  2128 		 * then make a copy before we start the write so that
  2107 		 * then make a copy before we start the write so that
  2175 		else
  2154 		else
  2176 			dbuf_sync_leaf(dr, tx);
  2155 			dbuf_sync_leaf(dr, tx);
  2177 	}
  2156 	}
  2178 }
  2157 }
  2179 
  2158 
       
  2159 /* ARGSUSED */
       
  2160 static void
       
  2161 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
       
  2162 {
       
  2163 	dmu_buf_impl_t *db = vdb;
       
  2164 	blkptr_t *bp = zio->io_bp;
       
  2165 	blkptr_t *bp_orig = &zio->io_bp_orig;
       
  2166 	dnode_t *dn = db->db_dnode;
       
  2167 	spa_t *spa = zio->io_spa;
       
  2168 	int64_t delta;
       
  2169 	uint64_t fill = 0;
       
  2170 	int i;
       
  2171 
       
  2172 	ASSERT(db->db_blkptr == bp);
       
  2173 
       
  2174 	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
       
  2175 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
       
  2176 	zio->io_prev_space_delta = delta;
       
  2177 
       
  2178 	if (BP_IS_HOLE(bp)) {
       
  2179 		ASSERT(bp->blk_fill == 0);
       
  2180 		return;
       
  2181 	}
       
  2182 
       
  2183 	ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
       
  2184 	ASSERT(BP_GET_LEVEL(bp) == db->db_level);
       
  2185 
       
  2186 	mutex_enter(&db->db_mtx);
       
  2187 
       
  2188 	if (db->db_level == 0) {
       
  2189 		mutex_enter(&dn->dn_mtx);
       
  2190 		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
       
  2191 			dn->dn_phys->dn_maxblkid = db->db_blkid;
       
  2192 		mutex_exit(&dn->dn_mtx);
       
  2193 
       
  2194 		if (dn->dn_type == DMU_OT_DNODE) {
       
  2195 			dnode_phys_t *dnp = db->db.db_data;
       
  2196 			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
       
  2197 			    i--, dnp++) {
       
  2198 				if (dnp->dn_type != DMU_OT_NONE)
       
  2199 					fill++;
       
  2200 			}
       
  2201 		} else {
       
  2202 			fill = 1;
       
  2203 		}
       
  2204 	} else {
       
  2205 		blkptr_t *ibp = db->db.db_data;
       
  2206 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
       
  2207 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
       
  2208 			if (BP_IS_HOLE(ibp))
       
  2209 				continue;
       
  2210 			ASSERT3U(BP_GET_LSIZE(ibp), ==,
       
  2211 			    db->db_level == 1 ? dn->dn_datablksz :
       
  2212 			    (1<<dn->dn_phys->dn_indblkshift));
       
  2213 			fill += ibp->blk_fill;
       
  2214 		}
       
  2215 	}
       
  2216 
       
  2217 	bp->blk_fill = fill;
       
  2218 
       
  2219 	mutex_exit(&db->db_mtx);
       
  2220 }
       
  2221 
       
  2222 /* ARGSUSED */
       
  2223 static void
       
  2224 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
       
  2225 {
       
  2226 	dmu_buf_impl_t *db = vdb;
       
  2227 	blkptr_t *bp = zio->io_bp;
       
  2228 	blkptr_t *bp_orig = &zio->io_bp_orig;
       
  2229 	dnode_t *dn = db->db_dnode;
       
  2230 	objset_t *os = dn->dn_objset;
       
  2231 	uint64_t txg = zio->io_txg;
       
  2232 	dbuf_dirty_record_t **drp, *dr;
       
  2233 
       
  2234 	ASSERT3U(zio->io_error, ==, 0);
       
  2235 	ASSERT(db->db_blkptr == bp);
       
  2236 
       
  2237 	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
       
  2238 		ASSERT(BP_EQUAL(bp, bp_orig));
       
  2239 	} else {
       
  2240 		dsl_dataset_t *ds = os->os_dsl_dataset;
       
  2241 		dmu_tx_t *tx = os->os_synctx;
       
  2242 
       
  2243 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
       
  2244 		dsl_dataset_block_born(ds, bp, tx);
       
  2245 	}
       
  2246 
       
  2247 	mutex_enter(&db->db_mtx);
       
  2248 
       
  2249 	DBUF_VERIFY(db);
       
  2250 
       
  2251 	drp = &db->db_last_dirty;
       
  2252 	while ((dr = *drp) != db->db_data_pending)
       
  2253 		drp = &dr->dr_next;
       
  2254 	ASSERT(!list_link_active(&dr->dr_dirty_node));
       
  2255 	ASSERT(dr->dr_txg == txg);
       
  2256 	ASSERT(dr->dr_dbuf == db);
       
  2257 	ASSERT(dr->dr_next == NULL);
       
  2258 	*drp = dr->dr_next;
       
  2259 
       
  2260 	if (db->db_level == 0) {
       
  2261 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
       
  2262 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
       
  2263 		if (db->db_state != DB_NOFILL) {
       
  2264 			if (dr->dt.dl.dr_data != db->db_buf)
       
  2265 				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
       
  2266 				    db) == 1);
       
  2267 			else if (!arc_released(db->db_buf))
       
  2268 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
       
  2269 		}
       
  2270 	} else {
       
  2271 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
       
  2272 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
       
  2273 		if (!BP_IS_HOLE(db->db_blkptr)) {
       
  2274 			int epbs =
       
  2275 			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
       
  2276 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
       
  2277 			    db->db.db_size);
       
  2278 			ASSERT3U(dn->dn_phys->dn_maxblkid
       
  2279 			    >> (db->db_level * epbs), >=, db->db_blkid);
       
  2280 			arc_set_callback(db->db_buf, dbuf_do_evict, db);
       
  2281 		}
       
  2282 		mutex_destroy(&dr->dt.di.dr_mtx);
       
  2283 		list_destroy(&dr->dt.di.dr_children);
       
  2284 	}
       
  2285 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
       
  2286 
       
  2287 	cv_broadcast(&db->db_changed);
       
  2288 	ASSERT(db->db_dirtycnt > 0);
       
  2289 	db->db_dirtycnt -= 1;
       
  2290 	db->db_data_pending = NULL;
       
  2291 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
       
  2292 }
       
  2293 
       
  2294 static void
       
  2295 dbuf_write_nofill_ready(zio_t *zio)
       
  2296 {
       
  2297 	dbuf_write_ready(zio, NULL, zio->io_private);
       
  2298 }
       
  2299 
       
  2300 static void
       
  2301 dbuf_write_nofill_done(zio_t *zio)
       
  2302 {
       
  2303 	dbuf_write_done(zio, NULL, zio->io_private);
       
  2304 }
       
  2305 
       
  2306 static void
       
  2307 dbuf_write_override_ready(zio_t *zio)
       
  2308 {
       
  2309 	dbuf_dirty_record_t *dr = zio->io_private;
       
  2310 	dmu_buf_impl_t *db = dr->dr_dbuf;
       
  2311 
       
  2312 	dbuf_write_ready(zio, NULL, db);
       
  2313 }
       
  2314 
       
  2315 static void
       
  2316 dbuf_write_override_done(zio_t *zio)
       
  2317 {
       
  2318 	dbuf_dirty_record_t *dr = zio->io_private;
       
  2319 	dmu_buf_impl_t *db = dr->dr_dbuf;
       
  2320 	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
       
  2321 
       
  2322 	mutex_enter(&db->db_mtx);
       
  2323 	if (!BP_EQUAL(zio->io_bp, obp)) {
       
  2324 		if (!BP_IS_HOLE(obp))
       
  2325 			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
       
  2326 		arc_release(dr->dt.dl.dr_data, db);
       
  2327 	}
       
  2328 	mutex_exit(&db->db_mtx);
       
  2329 
       
  2330 	dbuf_write_done(zio, NULL, db);
       
  2331 }
       
  2332 
  2180 static void
  2333 static void
  2181 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
  2334 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
  2182 {
  2335 {
  2183 	dmu_buf_impl_t *db = dr->dr_dbuf;
  2336 	dmu_buf_impl_t *db = dr->dr_dbuf;
  2184 	dnode_t *dn = db->db_dnode;
  2337 	dnode_t *dn = db->db_dnode;
  2185 	objset_t *os = dn->dn_objset;
  2338 	objset_t *os = dn->dn_objset;
  2186 	dmu_buf_impl_t *parent = db->db_parent;
  2339 	dmu_buf_impl_t *parent = db->db_parent;
  2187 	uint64_t txg = tx->tx_txg;
  2340 	uint64_t txg = tx->tx_txg;
  2188 	zbookmark_t zb;
  2341 	zbookmark_t zb;
  2189 	writeprops_t wp = { 0 };
  2342 	zio_prop_t zp;
  2190 	zio_t *zio;
  2343 	zio_t *zio;
  2191 
  2344 
  2192 	if (!BP_IS_HOLE(db->db_blkptr) &&
  2345 	if (db->db_state != DB_NOFILL) {
  2193 	    (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
  2346 		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
  2194 		/*
  2347 			/*
  2195 		 * Private object buffers are released here rather
  2348 			 * Private object buffers are released here rather
  2196 		 * than in dbuf_dirty() since they are only modified
  2349 			 * than in dbuf_dirty() since they are only modified
  2197 		 * in the syncing context and we don't want the
  2350 			 * in the syncing context and we don't want the
  2198 		 * overhead of making multiple copies of the data.
  2351 			 * overhead of making multiple copies of the data.
  2199 		 */
  2352 			 */
  2200 		arc_release(data, db);
  2353 			if (BP_IS_HOLE(db->db_blkptr)) {
  2201 	} else if (db->db_state != DB_NOFILL) {
  2354 				arc_buf_thaw(data);
  2202 		ASSERT(arc_released(data));
  2355 			} else {
  2203 		/* XXX why do we need to thaw here? */
  2356 				arc_release(data, db);
  2204 		arc_buf_thaw(data);
  2357 			}
       
  2358 		}
  2205 	}
  2359 	}
  2206 
  2360 
  2207 	if (parent != dn->dn_dbuf) {
  2361 	if (parent != dn->dn_dbuf) {
  2208 		ASSERT(parent && parent->db_data_pending);
  2362 		ASSERT(parent && parent->db_data_pending);
  2209 		ASSERT(db->db_level == parent->db_level-1);
  2363 		ASSERT(db->db_level == parent->db_level-1);
  2218 
  2372 
  2219 	ASSERT(db->db_level == 0 || data == db->db_buf);
  2373 	ASSERT(db->db_level == 0 || data == db->db_buf);
  2220 	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
  2374 	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
  2221 	ASSERT(zio);
  2375 	ASSERT(zio);
  2222 
  2376 
  2223 	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
  2377 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
  2224 	zb.zb_object = db->db.db_object;
  2378 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
  2225 	zb.zb_level = db->db_level;
  2379 	    db->db.db_object, db->db_level, db->db_blkid);
  2226 	zb.zb_blkid = db->db_blkid;
  2380 
  2227 
  2381 	dmu_write_policy(os, dn, db->db_level,
  2228 	wp.wp_type = dn->dn_type;
  2382 	    db->db_state == DB_NOFILL ? WP_NOFILL : 0, &zp);
  2229 	wp.wp_level = db->db_level;
  2383 
  2230 	wp.wp_copies = os->os_copies;
  2384 	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
  2231 	wp.wp_dncompress = dn->dn_compress;
  2385 		ASSERT(db->db_state != DB_NOFILL);
  2232 	wp.wp_oscompress = os->os_compress;
  2386 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
  2233 	wp.wp_dnchecksum = dn->dn_checksum;
  2387 		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
  2234 	wp.wp_oschecksum = os->os_checksum;
  2388 		    dbuf_write_override_ready, dbuf_write_override_done, dr,
  2235 
  2389 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
  2236 	if (BP_IS_OLDER(db->db_blkptr, txg))
  2390 		mutex_enter(&db->db_mtx);
  2237 		(void) dsl_dataset_block_kill(
  2391 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
  2238 		    os->os_dsl_dataset, db->db_blkptr, zio, tx);
  2392 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
  2239 
  2393 		    dr->dt.dl.dr_copies);
  2240 	if (db->db_state == DB_NOFILL) {
  2394 		mutex_exit(&db->db_mtx);
  2241 		zio_prop_t zp = { 0 };
  2395 	} else if (db->db_state == DB_NOFILL) {
  2242 
  2396 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
  2243 		write_policy(os->os_spa, &wp, &zp);
  2397 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
  2244 		dr->dr_zio = zio_write(zio, os->os_spa,
  2398 		    db->db_blkptr, NULL, db->db.db_size, &zp,
  2245 		    txg, db->db_blkptr, NULL,
  2399 		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
  2246 		    db->db.db_size, &zp, dbuf_skip_write_ready,
  2400 		    ZIO_PRIORITY_ASYNC_WRITE,
  2247 		    dbuf_skip_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
  2401 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
  2248 		    ZIO_FLAG_MUSTSUCCEED, &zb);
       
  2249 	} else {
  2402 	} else {
  2250 		dr->dr_zio = arc_write(zio, os->os_spa, &wp,
  2403 		ASSERT(arc_released(data));
  2251 		    DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
  2404 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
  2252 		    data, dbuf_write_ready, dbuf_write_done, db,
  2405 		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
       
  2406 		    dbuf_write_ready, dbuf_write_done, db,
  2253 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
  2407 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
  2254 	}
  2408 	}
  2255 }
  2409 }
  2256 
       
  2257 /* wrapper function for dbuf_write_ready bypassing ARC */
       
  2258 static void
       
  2259 dbuf_skip_write_ready(zio_t *zio)
       
  2260 {
       
  2261 	blkptr_t *bp = zio->io_bp;
       
  2262 
       
  2263 	if (!BP_IS_GANG(bp))
       
  2264 		zio_skip_write(zio);
       
  2265 
       
  2266 	dbuf_write_ready(zio, NULL, zio->io_private);
       
  2267 }
       
  2268 
       
  2269 /* wrapper function for dbuf_write_done bypassing ARC */
       
  2270 static void
       
  2271 dbuf_skip_write_done(zio_t *zio)
       
  2272 {
       
  2273 	dbuf_write_done(zio, NULL, zio->io_private);
       
  2274 }
       
  2275 
       
  2276 /* ARGSUSED */
       
  2277 static void
       
  2278 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
       
  2279 {
       
  2280 	dmu_buf_impl_t *db = vdb;
       
  2281 	dnode_t *dn = db->db_dnode;
       
  2282 	objset_t *os = dn->dn_objset;
       
  2283 	blkptr_t *bp = zio->io_bp;
       
  2284 	blkptr_t *bp_orig = &zio->io_bp_orig;
       
  2285 	uint64_t fill = 0;
       
  2286 	int old_size, new_size, i;
       
  2287 
       
  2288 	ASSERT(db->db_blkptr == bp);
       
  2289 
       
  2290 	dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
       
  2291 
       
  2292 	old_size = bp_get_dasize(os->os_spa, bp_orig);
       
  2293 	new_size = bp_get_dasize(os->os_spa, bp);
       
  2294 
       
  2295 	dnode_diduse_space(dn, new_size - old_size);
       
  2296 
       
  2297 	if (BP_IS_HOLE(bp)) {
       
  2298 		dsl_dataset_t *ds = os->os_dsl_dataset;
       
  2299 		dmu_tx_t *tx = os->os_synctx;
       
  2300 
       
  2301 		if (bp_orig->blk_birth == tx->tx_txg)
       
  2302 			(void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
       
  2303 		ASSERT3U(bp->blk_fill, ==, 0);
       
  2304 		return;
       
  2305 	}
       
  2306 
       
  2307 	ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
       
  2308 	ASSERT(BP_GET_LEVEL(bp) == db->db_level);
       
  2309 
       
  2310 	mutex_enter(&db->db_mtx);
       
  2311 
       
  2312 	if (db->db_level == 0) {
       
  2313 		mutex_enter(&dn->dn_mtx);
       
  2314 		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
       
  2315 			dn->dn_phys->dn_maxblkid = db->db_blkid;
       
  2316 		mutex_exit(&dn->dn_mtx);
       
  2317 
       
  2318 		if (dn->dn_type == DMU_OT_DNODE) {
       
  2319 			dnode_phys_t *dnp = db->db.db_data;
       
  2320 			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
       
  2321 			    i--, dnp++) {
       
  2322 				if (dnp->dn_type != DMU_OT_NONE)
       
  2323 					fill++;
       
  2324 			}
       
  2325 		} else {
       
  2326 			fill = 1;
       
  2327 		}
       
  2328 	} else {
       
  2329 		blkptr_t *ibp = db->db.db_data;
       
  2330 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
       
  2331 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
       
  2332 			if (BP_IS_HOLE(ibp))
       
  2333 				continue;
       
  2334 			ASSERT3U(BP_GET_LSIZE(ibp), ==,
       
  2335 			    db->db_level == 1 ? dn->dn_datablksz :
       
  2336 			    (1<<dn->dn_phys->dn_indblkshift));
       
  2337 			fill += ibp->blk_fill;
       
  2338 		}
       
  2339 	}
       
  2340 
       
  2341 	bp->blk_fill = fill;
       
  2342 
       
  2343 	mutex_exit(&db->db_mtx);
       
  2344 
       
  2345 	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
       
  2346 		ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
       
  2347 	} else {
       
  2348 		dsl_dataset_t *ds = os->os_dsl_dataset;
       
  2349 		dmu_tx_t *tx = os->os_synctx;
       
  2350 
       
  2351 		if (bp_orig->blk_birth == tx->tx_txg)
       
  2352 			(void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
       
  2353 		dsl_dataset_block_born(ds, bp, tx);
       
  2354 	}
       
  2355 }
       
  2356 
       
  2357 /* ARGSUSED */
       
  2358 static void
       
  2359 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
       
  2360 {
       
  2361 	dmu_buf_impl_t *db = vdb;
       
  2362 	uint64_t txg = zio->io_txg;
       
  2363 	dbuf_dirty_record_t **drp, *dr;
       
  2364 
       
  2365 	ASSERT3U(zio->io_error, ==, 0);
       
  2366 
       
  2367 	mutex_enter(&db->db_mtx);
       
  2368 
       
  2369 	drp = &db->db_last_dirty;
       
  2370 	while ((dr = *drp) != db->db_data_pending)
       
  2371 		drp = &dr->dr_next;
       
  2372 	ASSERT(!list_link_active(&dr->dr_dirty_node));
       
  2373 	ASSERT(dr->dr_txg == txg);
       
  2374 	ASSERT(dr->dr_next == NULL);
       
  2375 	*drp = dr->dr_next;
       
  2376 
       
  2377 	if (db->db_level == 0) {
       
  2378 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
       
  2379 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
       
  2380 
       
  2381 		if (db->db_state != DB_NOFILL) {
       
  2382 			if (dr->dt.dl.dr_data != db->db_buf)
       
  2383 				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
       
  2384 				    db) == 1);
       
  2385 			else if (!BP_IS_HOLE(db->db_blkptr))
       
  2386 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
       
  2387 			else
       
  2388 				ASSERT(arc_released(db->db_buf));
       
  2389 		}
       
  2390 	} else {
       
  2391 		dnode_t *dn = db->db_dnode;
       
  2392 
       
  2393 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
       
  2394 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
       
  2395 		if (!BP_IS_HOLE(db->db_blkptr)) {
       
  2396 			int epbs =
       
  2397 			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
       
  2398 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
       
  2399 			    db->db.db_size);
       
  2400 			ASSERT3U(dn->dn_phys->dn_maxblkid
       
  2401 			    >> (db->db_level * epbs), >=, db->db_blkid);
       
  2402 			arc_set_callback(db->db_buf, dbuf_do_evict, db);
       
  2403 		}
       
  2404 		mutex_destroy(&dr->dt.di.dr_mtx);
       
  2405 		list_destroy(&dr->dt.di.dr_children);
       
  2406 	}
       
  2407 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
       
  2408 
       
  2409 	cv_broadcast(&db->db_changed);
       
  2410 	ASSERT(db->db_dirtycnt > 0);
       
  2411 	db->db_dirtycnt -= 1;
       
  2412 	db->db_data_pending = NULL;
       
  2413 	mutex_exit(&db->db_mtx);
       
  2414 
       
  2415 	dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
       
  2416 
       
  2417 	dbuf_rele(db, (void *)(uintptr_t)txg);
       
  2418 }