--- a/usr/src/uts/common/fs/zfs/arc.c Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/arc.c Fri Oct 17 16:50:52 2008 -0600
@@ -3077,7 +3077,7 @@
kmem_free(callback, sizeof (arc_write_callback_t));
}
-static void
+void
write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
{
boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
--- a/usr/src/uts/common/fs/zfs/dbuf.c Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dbuf.c Fri Oct 17 16:50:52 2008 -0600
@@ -40,6 +40,8 @@
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
static arc_done_func_t dbuf_write_ready;
static arc_done_func_t dbuf_write_done;
+static zio_done_func_t dbuf_skip_write_ready;
+static zio_done_func_t dbuf_skip_write_done;
/*
* Global data structures and functions for the dbuf cache.
@@ -396,7 +398,8 @@
} else {
dbuf_evict_user(db);
db->db.db_data = NULL;
- db->db_state = DB_UNCACHED;
+ if (db->db_state != DB_NOFILL)
+ db->db_state = DB_UNCACHED;
}
}
@@ -537,6 +540,9 @@
*/
ASSERT(!refcount_is_zero(&db->db_holds));
+ if (db->db_state == DB_NOFILL)
+ return (EIO);
+
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
@@ -612,6 +618,8 @@
dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
db->db.db_size, db, type));
db->db_state = DB_FILL;
+ } else if (db->db_state == DB_NOFILL) {
+ dbuf_set_data(db, NULL);
} else {
ASSERT3U(db->db_state, ==, DB_CACHED);
}
@@ -755,6 +763,7 @@
mutex_enter(&db->db_mtx);
if (db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL ||
db->db_state == DB_EVICTING) {
ASSERT(db->db.db_data == NULL);
mutex_exit(&db->db_mtx);
@@ -924,7 +933,8 @@
* syncing context don't bother holding ahead.
*/
ASSERT(db->db_level != 0 ||
- db->db_state == DB_CACHED || db->db_state == DB_FILL);
+ db->db_state == DB_CACHED || db->db_state == DB_FILL ||
+ db->db_state == DB_NOFILL);
mutex_enter(&dn->dn_mtx);
/*
@@ -1011,22 +1021,26 @@
if (db->db_level == 0) {
void *data_old = db->db_buf;
- if (db->db_blkid == DB_BONUS_BLKID) {
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db.db_data;
- } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
- /*
- * Release the data buffer from the cache so that we
- * can modify it without impacting possible other users
- * of this cached data block. Note that indirect
- * blocks and private objects are not released until the
- * syncing state (since they are only modified then).
- */
- arc_release(db->db_buf, db);
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db_buf;
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db.db_data;
+ } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * Release the data buffer from the cache so
+ * that we can modify it without impacting
+ * possible other users of this cached data
+ * block. Note that indirect blocks and
+ * private objects are not released until the
+ * syncing state (since they are only modified
+ * then).
+ */
+ arc_release(db->db_buf, db);
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db_buf;
+ }
+ ASSERT(data_old != NULL);
}
- ASSERT(data_old != NULL);
dr->dt.dl.dr_data = data_old;
} else {
mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -1199,12 +1213,15 @@
}
if (db->db_level == 0) {
- dbuf_unoverride(dr);
+ if (db->db_state != DB_NOFILL) {
+ dbuf_unoverride(dr);
- ASSERT(db->db_buf != NULL);
- ASSERT(dr->dt.dl.dr_data != NULL);
- if (dr->dt.dl.dr_data != db->db_buf)
- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
+ db) == 1);
+ }
} else {
ASSERT(db->db_buf != NULL);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -1246,6 +1263,16 @@
}
void
+dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ db->db_state = DB_NOFILL;
+
+ dmu_buf_will_fill(db_fake, tx);
+}
+
+void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
@@ -1320,7 +1347,7 @@
db->db_state = DB_UNCACHED;
}
- ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
ASSERT(db->db_data_pending == NULL);
db->db_state = DB_EVICTING;
@@ -1745,7 +1772,8 @@
* This is a special case: we never associated this
* dbuf with any data allocated from the ARC.
*/
- ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ ASSERT(db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL);
dbuf_evict(db);
} else if (arc_released(db->db_buf)) {
arc_buf_t *buf = db->db_buf;
@@ -1933,7 +1961,7 @@
/* This buffer was freed and is now being re-filled */
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
} else {
- ASSERT3U(db->db_state, ==, DB_CACHED);
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
}
DBUF_VERIFY(db);
@@ -2021,26 +2049,33 @@
return;
}
- blksz = arc_buf_size(*datap);
+ if (db->db_state != DB_NOFILL) {
+ blksz = arc_buf_size(*datap);
- if (dn->dn_object != DMU_META_DNODE_OBJECT) {
- /*
- * If this buffer is currently "in use" (i.e., there are
- * active holds and db_data still references it), then make
- * a copy before we start the write so that any modifications
- * from the open txg will not leak into this write.
- *
- * NOTE: this copy does not need to be made for objects only
- * modified in the syncing context (e.g. DNONE_DNODE blocks).
- */
- if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
- bcopy(db->db.db_data, (*datap)->b_data, blksz);
+ if (dn->dn_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * If this buffer is currently "in use" (i.e., there
+ * are active holds and db_data still references it),
+ * then make a copy before we start the write so that
+ * any modifications from the open txg will not leak
+ * into this write.
+ *
+ * NOTE: this copy does not need to be made for
+ * objects only modified in the syncing context (e.g.
+			 * DMU_META_DNODE blocks).
+ */
+ if (refcount_count(&db->db_holds) > 1 &&
+ *datap == db->db_buf) {
+ arc_buf_contents_t type =
+ DBUF_GET_BUFC_TYPE(db);
+ *datap =
+ arc_buf_alloc(os->os_spa, blksz, db, type);
+ bcopy(db->db.db_data, (*datap)->b_data, blksz);
+ }
}
+
+ ASSERT(*datap != NULL);
}
-
- ASSERT(*datap != NULL);
db->db_data_pending = dr;
mutex_exit(&db->db_mtx);
@@ -2101,7 +2136,7 @@
* overhead of making multiple copies of the data.
*/
arc_release(data, db);
- } else {
+ } else if (db->db_state != DB_NOFILL) {
ASSERT(arc_released(data));
/* XXX why do we need to thaw here? */
arc_buf_thaw(data);
@@ -2140,10 +2175,40 @@
(void) dsl_dataset_block_kill(
os->os_dsl_dataset, db->db_blkptr, zio, tx);
- dr->dr_zio = arc_write(zio, os->os_spa, &wp,
- DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
- data, dbuf_write_ready, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ if (db->db_state == DB_NOFILL) {
+ zio_prop_t zp = { 0 };
+
+ write_policy(os->os_spa, &wp, &zp);
+ dr->dr_zio = zio_write(zio, os->os_spa,
+ txg, db->db_blkptr, NULL,
+ db->db.db_size, &zp, dbuf_skip_write_ready,
+ dbuf_skip_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED, &zb);
+ } else {
+ dr->dr_zio = arc_write(zio, os->os_spa, &wp,
+ DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
+ data, dbuf_write_ready, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ }
+}
+
+/* wrapper function for dbuf_write_ready bypassing ARC */
+static void
+dbuf_skip_write_ready(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (!BP_IS_GANG(bp))
+ zio_skip_write(zio);
+
+ dbuf_write_ready(zio, NULL, zio->io_private);
+}
+
+/* wrapper function for dbuf_write_done bypassing ARC */
+static void
+dbuf_skip_write_done(zio_t *zio)
+{
+ dbuf_write_done(zio, NULL, zio->io_private);
}
/* ARGSUSED */
@@ -2251,12 +2316,15 @@
ASSERT(db->db_blkid != DB_BONUS_BLKID);
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
- if (dr->dt.dl.dr_data != db->db_buf)
- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
- else if (!BP_IS_HOLE(db->db_blkptr))
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
- else
- ASSERT(arc_released(db->db_buf));
+ if (db->db_state != DB_NOFILL) {
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
+ db) == 1);
+ else if (!BP_IS_HOLE(db->db_blkptr))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ else
+ ASSERT(arc_released(db->db_buf));
+ }
} else {
dnode_t *dn = db->db_dnode;
--- a/usr/src/uts/common/fs/zfs/dmu.c Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c Fri Oct 17 16:50:52 2008 -0600
@@ -638,6 +638,27 @@
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
+void
+dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+
+ if (size == 0)
+ return;
+
+ VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
+
+ for (i = 0; i < numbufs; i++) {
+ dmu_buf_t *db = dbp[i];
+
+ dmu_buf_will_not_fill(db, tx);
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
#ifdef _KERNEL
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
@@ -991,7 +1012,6 @@
zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db),
txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
-
if (pio) {
zio_nowait(zio);
err = EINPROGRESS;
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c Fri Oct 17 16:50:52 2008 -0600
@@ -177,7 +177,6 @@
min_ibs = DN_MIN_INDBLKSHIFT;
max_ibs = DN_MAX_INDBLKSHIFT;
-
/*
* For i/o error checking, read the first and last level-0
* blocks (if they are not aligned), and all the level-1 blocks.
@@ -185,9 +184,12 @@
if (dn) {
if (dn->dn_maxblkid == 0) {
- err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
- if (err)
- goto out;
+ if ((off > 0 || len < dn->dn_datablksz) &&
+ off < dn->dn_datablksz) {
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err)
+ goto out;
+ }
} else {
zio_t *zio = zio_root(dn->dn_objset->os_spa,
NULL, NULL, ZIO_FLAG_CANFAIL);
@@ -203,7 +205,7 @@
/* last level-0 block */
end = (off+len-1) >> dn->dn_datablkshift;
- if (end != start &&
+ if (end != start && end <= dn->dn_maxblkid &&
P2PHASE(off+len, dn->dn_datablksz)) {
err = dmu_tx_check_ioerr(zio, dn, 0, end);
if (err)
--- a/usr/src/uts/common/fs/zfs/sys/arc.h Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h Fri Oct 17 16:50:52 2008 -0600
@@ -94,6 +94,7 @@
uint8_t wp_dnchecksum, wp_oschecksum;
} writeprops_t;
+void write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp);
int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
arc_done_func_t *done, void *private, int priority, int zio_flags,
uint32_t *arc_flags, const zbookmark_t *zb);
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Fri Oct 17 16:50:52 2008 -0600
@@ -26,8 +26,6 @@
#ifndef _SYS_DBUF_H
#define _SYS_DBUF_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/spa.h>
#include <sys/txg.h>
@@ -55,19 +53,23 @@
#define DB_RF_CACHED (1 << 5)
/*
- * The state transition diagram for dbufs looks like:
+ * The simplified state transition diagram for dbufs looks like:
*
* +----> READ ----+
* | |
* | V
* (alloc)-->UNCACHED CACHED-->EVICTING-->(free)
- * | ^
- * | |
- * +----> FILL ----+
+ * | ^ ^
+ * | | |
+ * +----> FILL ----+ |
+ * | |
+ * | |
+ * +--------> NOFILL -------+
*/
typedef enum dbuf_states {
DB_UNCACHED,
DB_FILL,
+ DB_NOFILL,
DB_READ,
DB_CACHED,
DB_EVICTING
@@ -258,8 +260,8 @@
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Fri Oct 17 16:50:52 2008 -0600
@@ -26,8 +26,6 @@
#ifndef _SYS_DMU_H
#define _SYS_DMU_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This file describes the interface that the DMU provides for its
* consumers.
@@ -451,6 +449,8 @@
void *buf);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
+void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ dmu_tx_t *tx);
int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/zio.h Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h Fri Oct 17 16:50:52 2008 -0600
@@ -337,6 +337,8 @@
void *data, uint64_t size, zio_done_func_t *done, void *private,
int priority, int flags, zbookmark_t *zb);
+extern void zio_skip_write(zio_t *zio);
+
extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_done_func_t *done, void *private, int flags);
--- a/usr/src/uts/common/fs/zfs/zio.c Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zio.c Fri Oct 17 16:50:52 2008 -0600
@@ -512,6 +512,16 @@
return (zio);
}
+void
+zio_skip_write(zio_t *zio)
+{
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_stage == ZIO_STAGE_READY);
+ ASSERT(!BP_IS_GANG(zio->io_bp));
+
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+}
+
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, zio_prop_t *zp,
--- a/usr/src/uts/common/fs/zfs/zvol.c Fri Oct 17 15:36:23 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zvol.c Fri Oct 17 16:50:52 2008 -0600
@@ -623,7 +623,6 @@
{
objset_t *os = zv->zv_objset;
dmu_tx_t *tx;
- void *data;
uint64_t refd, avail, usedobjs, availobjs;
uint64_t resid = zv->zv_volsize;
uint64_t off = 0;
@@ -636,9 +635,6 @@
/* Free old extents if they exist */
zvol_free_extents(zv);
- /* allocate the blocks by writing each one */
- data = kmem_zalloc(SPA_MAXBLOCKSIZE, KM_SLEEP);
-
while (resid != 0) {
int error;
uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
@@ -648,16 +644,14 @@
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
- kmem_free(data, SPA_MAXBLOCKSIZE);
(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
return (error);
}
- dmu_write(os, ZVOL_OBJ, off, bytes, data, tx);
+ dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
dmu_tx_commit(tx);
off += bytes;
resid -= bytes;
}
- kmem_free(data, SPA_MAXBLOCKSIZE);
txg_wait_synced(dmu_objset_pool(os), 0);
return (0);