6573681 deleting a very large file can be slow
6706950 ((&dnp->dn_blkptr[0])->blk_birth == 0) || list_head(list) != 0L || dn->dn_next_blksz[txgoff]
--- a/usr/src/cmd/zdb/zdb.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/cmd/zdb/zdb.c Tue Jul 01 12:01:12 2008 -0700
@@ -1093,13 +1093,13 @@
}
for (;;) {
- error = dnode_next_offset(dn, B_FALSE, &start, minlvl,
- blkfill, 0);
+ error = dnode_next_offset(dn,
+ 0, &start, minlvl, blkfill, 0);
if (error)
break;
end = start;
- error = dnode_next_offset(dn, B_TRUE, &end, minlvl,
- blkfill, 0);
+ error = dnode_next_offset(dn,
+ DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
nicenum(end - start, segsize);
(void) printf("\t\tsegment [%016llx, %016llx)"
" size %5s\n", (u_longlong_t)start,
--- a/usr/src/uts/common/fs/zfs/dbuf.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dbuf.c Tue Jul 01 12:01:12 2008 -0700
@@ -705,22 +705,50 @@
arc_release(dr->dt.dl.dr_data, db);
}
+/*
+ * Evict (if its unreferenced) or clear (if its referenced) any level-0
+ * data blocks in the free range, so that any future readers will find
+ * empty blocks. Also, if we happen across any level-1 dbufs in the
+ * range that have not already been marked dirty, mark them dirty so
+ * they stay in memory.
+ */
void
-dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
dmu_buf_impl_t *db, *db_next;
uint64_t txg = tx->tx_txg;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ uint64_t first_l1 = start >> epbs;
+ uint64_t last_l1 = end >> epbs;
- dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
+ if (end > dn->dn_maxblkid) {
+ end = dn->dn_maxblkid;
+ last_l1 = end >> epbs;
+ }
+ dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
mutex_enter(&dn->dn_dbufs_mtx);
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+ if (db->db_level == 1 &&
+ db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_last_dirty &&
+ db->db_last_dirty->dr_txg < txg) {
+ dbuf_add_ref(db, FTAG);
+ mutex_exit(&db->db_mtx);
+ dbuf_will_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+ }
+
if (db->db_level != 0)
continue;
dprintf_dbuf(db, "found buf %s\n", "");
- if (db->db_blkid < blkid ||
- db->db_blkid >= blkid+nblks)
+ if (db->db_blkid < start || db->db_blkid > end)
continue;
/* found a level 0 buffer in the range */
@@ -1161,7 +1189,7 @@
list_remove(&dr->dr_parent->dt.di.dr_children, dr);
mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
} else if (db->db_level+1 == dn->dn_nlevels) {
- ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
+ ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
mutex_exit(&dn->dn_mtx);
@@ -1976,7 +2004,7 @@
mutex_exit(&db->db_mtx);
if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
- dsl_dataset_block_kill(os->os_dsl_dataset,
+ (void) dsl_dataset_block_kill(os->os_dsl_dataset,
&zio_fake.io_bp_orig, dn->dn_zio, tx);
dbuf_write_ready(&zio_fake, db->db_buf, db);
@@ -2105,7 +2133,7 @@
if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
zio_flags |= ZIO_FLAG_METADATA;
if (BP_IS_OLDER(db->db_blkptr, txg))
- dsl_dataset_block_kill(
+ (void) dsl_dataset_block_kill(
os->os_dsl_dataset, db->db_blkptr, zio, tx);
dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
@@ -2137,7 +2165,7 @@
dmu_tx_t *tx = os->os_synctx;
if (bp_orig->blk_birth == tx->tx_txg)
- dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+ (void) dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
return;
}
@@ -2185,7 +2213,7 @@
dmu_tx_t *tx = os->os_synctx;
if (bp_orig->blk_birth == tx->tx_txg)
- dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+ (void) dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
dsl_dataset_block_born(ds, zio->io_bp, tx);
}
}
--- a/usr/src/uts/common/fs/zfs/dmu.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -364,6 +364,152 @@
dnode_rele(dn, FTAG);
}
+static int
+get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit)
+{
+	uint64_t len = *offset - limit;
+	uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT;
+	uint64_t dn_used;
+	int err;
+
+	ASSERT(limit <= *offset);
+
+	dn_used = dn->dn_phys->dn_used <<
+	    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES ? 0 : DEV_BSHIFT);
+	if (len <= chunk_len || dn_used <= chunk_len) {
+		*offset = limit;
+		return (0);
+	}
+
+	while (*offset > limit) {
+		uint64_t initial_offset = *offset;
+		uint64_t delta;
+
+		/* skip over allocated data */
+		err = dnode_next_offset(dn,
+		    DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
+		if (err == ESRCH)
+			*offset = limit;
+		else if (err)
+			return (err);
+
+		ASSERT3U(*offset, <=, initial_offset);
+		delta = initial_offset - *offset;
+		if (delta >= chunk_len) {
+			*offset += delta - chunk_len;
+			return (0);
+		}
+		chunk_len -= delta;
+
+		/* skip over unallocated data */
+		err = dnode_next_offset(dn,
+		    DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
+		if (err == ESRCH)
+			*offset = limit;
+		else if (err)
+			return (err);
+
+		if (*offset < limit)
+			*offset = limit;
+		ASSERT3U(*offset, <, initial_offset);
+	}
+	return (0);
+}
+
+static int
+dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
+ uint64_t length, boolean_t free_dnode)
+{
+ dmu_tx_t *tx;
+ uint64_t object_size, start, end, len;
+ boolean_t trunc = (length == DMU_OBJECT_END);
+ int align, err;
+
+ align = 1 << dn->dn_datablkshift;
+ ASSERT(align > 0);
+ object_size = align == 1 ? dn->dn_datablksz :
+ (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
+
+ if (trunc || (end = offset + length) > object_size)
+ end = object_size;
+ if (end <= offset)
+ return (0);
+ length = end - offset;
+
+ while (length) {
+ start = end;
+ err = get_next_chunk(dn, &start, offset);
+ if (err)
+ return (err);
+ len = trunc ? DMU_OBJECT_END : end - start;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, dn->dn_object, start, len);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ dnode_free_range(dn, start, trunc ? -1 : len, tx);
+
+ if (start == 0 && trunc && free_dnode)
+ dnode_free(dn, tx);
+
+ length -= end - start;
+
+ dmu_tx_commit(tx);
+ end = start;
+ trunc = FALSE;
+ }
+ return (0);
+}
+
+int
+dmu_free_long_range(objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t length)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err != 0)
+ return (err);
+ err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_free_object(objset_t *os, uint64_t object)
+{
+ dnode_t *dn;
+ dmu_tx_t *tx;
+ int err;
+
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ FTAG, &dn);
+ if (err != 0)
+ return (err);
+ if (dn->dn_nlevels == 1) {
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, object);
+ dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+ dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
+ dnode_free(dn, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ } else {
+ err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
+ }
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx)
@@ -912,7 +1058,7 @@
return (err);
}
- err = dnode_next_offset(dn, hole, off, 1, 1, 0);
+ err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
dnode_rele(dn, FTAG);
return (err);
--- a/usr/src/uts/common/fs/zfs/dmu_object.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_object.c Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -54,7 +54,8 @@
if (P2PHASE(object, L2_dnode_count) == 0) {
uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
int error = dnode_next_offset(osi->os_meta_dnode,
- B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ DNODE_FIND_HOLE,
+ &offset, 2, DNODES_PER_BLOCK >> 2, 0);
restarted = B_TRUE;
if (error == 0)
object = offset >> DNODE_SHIFT;
@@ -139,6 +140,7 @@
return (err);
ASSERT(dn->dn_type != DMU_OT_NONE);
+ dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
dnode_free(dn, tx);
dnode_rele(dn, FTAG);
@@ -152,7 +154,7 @@
int error;
error = dnode_next_offset(os->os->os_meta_dnode,
- hole, &offset, 0, DNODES_PER_BLOCK, txg);
+ (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
*objectp = offset >> DNODE_SHIFT;
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Tue Jul 01 12:01:12 2008 -0700
@@ -829,7 +829,7 @@
if (!DVA_EQUAL(BP_IDENTITY(bp),
BP_IDENTITY(&zio->io_bp_orig))) {
if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
- dsl_dataset_block_kill(os->os_dsl_dataset,
+ (void) dsl_dataset_block_kill(os->os_dsl_dataset,
&zio->io_bp_orig, NULL, os->os_synctx);
dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
}
@@ -878,7 +878,7 @@
zb.zb_level = -1;
zb.zb_blkid = 0;
if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) {
- dsl_dataset_block_kill(os->os_dsl_dataset,
+ (void) dsl_dataset_block_kill(os->os_dsl_dataset,
os->os_rootbp, pio, tx);
}
zio = arc_write(pio, os->os_spa, os->os_md_checksum,
--- a/usr/src/uts/common/fs/zfs/dmu_send.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c Tue Jul 01 12:01:12 2008 -0700
@@ -877,23 +877,14 @@
for (obj = drrfo->drr_firstobj;
obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
(void) dmu_object_next(os, &obj, FALSE, 0)) {
- dmu_tx_t *tx;
int err;
if (dmu_object_info(os, obj, NULL) != 0)
continue;
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, obj);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
+ err = dmu_free_object(os, obj);
+ if (err)
return (err);
- }
- err = dmu_object_free(os, obj, tx);
- dmu_tx_commit(tx);
- if (err && err != ENOENT)
- return (EINVAL);
}
return (0);
}
@@ -939,7 +930,6 @@
restore_free(struct restorearg *ra, objset_t *os,
struct drr_free *drrf)
{
- dmu_tx_t *tx;
int err;
if (drrf->drr_length != -1ULL &&
@@ -949,18 +939,8 @@
if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
return (EINVAL);
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_free(tx, drrf->drr_object,
+ err = dmu_free_long_range(os, drrf->drr_object,
drrf->drr_offset, drrf->drr_length);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
- err = dmu_free_range(os, drrf->drr_object,
- drrf->drr_offset, drrf->drr_length, tx);
- dmu_tx_commit(tx);
return (err);
}
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c Tue Jul 01 12:01:12 2008 -0700
@@ -320,39 +320,25 @@
static void
dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
- uint64_t blkid, nblks;
- uint64_t space = 0, unref = 0;
+ uint64_t blkid, nblks, lastblk;
+ uint64_t space = 0, unref = 0, skipped = 0;
dnode_t *dn = txh->txh_dnode;
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
- int dirty;
+ int epbs;
- /*
- * We don't need to use any locking to check for dirtyness
- * because it's OK if we get stale data -- the dnode may become
- * dirty immediately after our check anyway. This is just a
- * means to avoid the expensive count when we aren't sure we
- * need it. We need to be able to deal with a dirty dnode.
- */
- dirty = list_link_active(&dn->dn_dirty_link[0]) |
- list_link_active(&dn->dn_dirty_link[1]) |
- list_link_active(&dn->dn_dirty_link[2]) |
- list_link_active(&dn->dn_dirty_link[3]);
- if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0)
+ if (dn->dn_nlevels == 0)
return;
/*
- * the struct_rwlock protects us against dn_phys->dn_nlevels
+ * The struct_rwlock protects us against dn_nlevels
* changing, in case (against all odds) we manage to dirty &
* sync out the changes after we check for being dirty.
- * also, dbuf_hold_impl() wants us to have the struct_rwlock.
- *
- * It's fine to use dn_datablkshift rather than the dn_phys
- * equivalent because if it is changing, maxblkid==0 and we will
- * bail.
+ * Also, dbuf_hold_level() wants us to have the struct_rwlock.
*/
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_phys->dn_maxblkid == 0) {
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ if (dn->dn_maxblkid == 0) {
if (off == 0 && len >= dn->dn_datablksz) {
blkid = 0;
nblks = 1;
@@ -362,24 +348,21 @@
}
} else {
blkid = off >> dn->dn_datablkshift;
- nblks = (off + len) >> dn->dn_datablkshift;
+ nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
- if (blkid >= dn->dn_phys->dn_maxblkid) {
+ if (blkid >= dn->dn_maxblkid) {
rw_exit(&dn->dn_struct_rwlock);
return;
}
- if (blkid + nblks > dn->dn_phys->dn_maxblkid)
- nblks = dn->dn_phys->dn_maxblkid - blkid;
+ if (blkid + nblks > dn->dn_maxblkid)
+ nblks = dn->dn_maxblkid - blkid;
- /* don't bother after 128,000 blocks */
- nblks = MIN(nblks, 128*1024);
}
-
- if (dn->dn_phys->dn_nlevels == 1) {
+ if (dn->dn_nlevels == 1) {
int i;
for (i = 0; i < nblks; i++) {
blkptr_t *bp = dn->dn_phys->dn_blkptr;
- ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
+ ASSERT3U(blkid + i, <, dn->dn_nblkptr);
bp += blkid + i;
if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
dprintf_bp(bp, "can free old%s", "");
@@ -390,51 +373,86 @@
nblks = 0;
}
+ /*
+ * Add in memory requirements of higher-level indirects
+ */
+ if (nblks && dn->dn_nlevels > 2) {
+ uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
+ int level = 2;
+
+ while (level++ < dn->dn_nlevels) {
+ txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
+ blkcnt = 1 + (blkcnt >> epbs);
+ }
+ ASSERT(blkcnt <= dn->dn_nblkptr);
+ }
+
+ lastblk = blkid + nblks - 1;
while (nblks) {
dmu_buf_impl_t *dbuf;
- int err, epbs, blkoff, tochk;
-
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- blkoff = P2PHASE(blkid, 1<<epbs);
- tochk = MIN((1<<epbs) - blkoff, nblks);
-
- err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
- if (err == 0) {
- int i;
- blkptr_t *bp;
+ uint64_t ibyte, new_blkid;
+ int epb = 1 << epbs;
+ int err, i, blkoff, tochk;
+ blkptr_t *bp;
- err = dbuf_read(dbuf, NULL,
- DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
- if (err != 0) {
- txh->txh_tx->tx_err = err;
- dbuf_rele(dbuf, FTAG);
- break;
- }
-
- bp = dbuf->db.db_data;
- bp += blkoff;
-
- for (i = 0; i < tochk; i++) {
- if (dsl_dataset_block_freeable(ds,
- bp[i].blk_birth)) {
- dprintf_bp(&bp[i],
- "can free old%s", "");
- space += bp_get_dasize(spa, &bp[i]);
- }
- unref += BP_GET_ASIZE(bp);
- }
- dbuf_rele(dbuf, FTAG);
- }
- if (err && err != ENOENT) {
+ ibyte = blkid << dn->dn_datablkshift;
+ err = dnode_next_offset(dn,
+ DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
+ new_blkid = ibyte >> dn->dn_datablkshift;
+ if (err == ESRCH)
+ break;
+ if (err) {
txh->txh_tx->tx_err = err;
break;
}
+ if (new_blkid > lastblk)
+ break;
+
+ if (new_blkid > blkid) {
+ skipped += new_blkid - blkid - 1;
+ nblks -= new_blkid - blkid;
+ blkid = new_blkid;
+ }
+ blkoff = P2PHASE(blkid, epb);
+ tochk = MIN(epb - blkoff, nblks);
+
+ dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG);
+
+ txh->txh_memory_tohold += dbuf->db.db_size;
+ if (txh->txh_memory_tohold > DMU_MAX_ACCESS) {
+ txh->txh_tx->tx_err = E2BIG;
+ dbuf_rele(dbuf, FTAG);
+ break;
+ }
+ err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ dbuf_rele(dbuf, FTAG);
+ break;
+ }
+
+ bp = dbuf->db.db_data;
+ bp += blkoff;
+
+ for (i = 0; i < tochk; i++) {
+ if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) {
+ dprintf_bp(&bp[i], "can free old%s", "");
+ space += bp_get_dasize(spa, &bp[i]);
+ }
+ unref += BP_GET_ASIZE(bp);
+ }
+ dbuf_rele(dbuf, FTAG);
blkid += tochk;
nblks -= tochk;
}
rw_exit(&dn->dn_struct_rwlock);
+ /* account for new level 1 indirect blocks that might show up */
+ if (skipped) {
+ skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
+ txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
+ }
txh->txh_space_tofree += space;
txh->txh_space_tounref += unref;
}
@@ -471,7 +489,7 @@
/*
* For i/o error checking, read the first and last level-0
* blocks, and all the level-1 blocks. The above count_write's
- * will take care of the level-0 blocks.
+ * have already taken care of the level-0 blocks.
*/
if (dn->dn_nlevels > 1) {
shift = dn->dn_datablkshift + dn->dn_indblkshift -
@@ -483,7 +501,7 @@
NULL, NULL, ZIO_FLAG_CANFAIL);
for (i = start; i <= end; i++) {
uint64_t ibyte = i << shift;
- err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0);
+ err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
i = ibyte >> shift;
if (err == ESRCH)
break;
@@ -706,12 +724,13 @@
match_offset = TRUE;
break;
case THT_FREE:
- if (blkid == beginblk &&
- (txh->txh_arg1 != 0 ||
- dn->dn_maxblkid == 0))
- match_offset = TRUE;
- if (blkid == endblk &&
- txh->txh_arg2 != DMU_OBJECT_END)
+ /*
+ * We will dirty all the level 1 blocks in
+ * the free range and perhaps the first and
+ * last level 0 block.
+ */
+ if (blkid >= beginblk && (blkid <= endblk ||
+ txh->txh_arg2 == DMU_OBJECT_END))
match_offset = TRUE;
break;
case THT_BONUS:
@@ -742,8 +761,8 @@
{
dmu_tx_hold_t *txh;
spa_t *spa = tx->tx_pool->dp_spa;
- uint64_t lsize, asize, fsize, usize;
- uint64_t towrite, tofree, tooverwrite, tounref;
+ uint64_t memory, asize, fsize, usize;
+ uint64_t towrite, tofree, tooverwrite, tounref, tohold;
ASSERT3U(tx->tx_txg, ==, 0);
@@ -776,7 +795,7 @@
* dmu_tx_unassign() logic.
*/
- towrite = tofree = tooverwrite = tounref = 0;
+ towrite = tofree = tooverwrite = tounref = tohold = 0;
for (txh = list_head(&tx->tx_holds); txh;
txh = list_next(&tx->tx_holds, txh)) {
dnode_t *dn = txh->txh_dnode;
@@ -797,6 +816,7 @@
tofree += txh->txh_space_tofree;
tooverwrite += txh->txh_space_tooverwrite;
tounref += txh->txh_space_tounref;
+ tohold += txh->txh_memory_tohold;
}
/*
@@ -817,24 +837,27 @@
tooverwrite = tofree = 0;
}
- /*
- * Convert logical size to worst-case allocated size.
- */
+ /* needed allocation: worst-case estimate of write space */
+ asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
+ /* freed space estimate: worst-case overwrite + free estimate */
fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
- lsize = towrite + tooverwrite;
- asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
+ /* convert unrefd space to worst-case estimate */
usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
+ /* calculate memory footprint estimate */
+ memory = towrite + tooverwrite + tohold;
#ifdef ZFS_DEBUG
- tx->tx_space_towrite = asize;
+ /* add in 'tohold' to account for our dirty holds on this memory */
+ tx->tx_space_towrite = asize +
+ spa_get_asize(tx->tx_pool->dp_spa, tohold);
tx->tx_space_tofree = tofree;
tx->tx_space_tooverwrite = tooverwrite;
tx->tx_space_tounref = tounref;
#endif
if (tx->tx_dir && asize != 0) {
- int err = dsl_dir_tempreserve_space(tx->tx_dir,
- lsize, asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
+ int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
+ asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
if (err)
return (err);
}
--- a/usr/src/uts/common/fs/zfs/dnode.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode.c Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -780,7 +780,7 @@
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
{
dmu_buf_impl_t *db, *db_next;
- int have_db0 = FALSE;
+ int err;
if (size == 0)
size = SPA_MINBLOCKSIZE;
@@ -805,9 +805,7 @@
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
- if (db->db_blkid == 0) {
- have_db0 = TRUE;
- } else if (db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) {
mutex_exit(&dn->dn_dbufs_mtx);
goto fail;
}
@@ -817,12 +815,12 @@
if (ibs && dn->dn_nlevels != 1)
goto fail;
- db = NULL;
- if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) {
- /* obtain the old block */
- db = dbuf_hold(dn, 0, FTAG);
+ /* resize the old block */
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+ if (err == 0)
dbuf_new_size(db, size, tx);
- }
+ else if (err != ENOENT)
+ goto fail;
dnode_setdblksz(dn, size);
dnode_setdirty(dn, tx);
@@ -831,7 +829,7 @@
dn->dn_indblkshift = ibs;
dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
}
-
+ /* rele after we have fixed the blocksize in the dnode */
if (db)
dbuf_rele(db, FTAG);
@@ -969,15 +967,15 @@
{
dmu_buf_impl_t *db;
uint64_t blkoff, blkid, nblks;
- int blksz, head;
+ int blksz, blkshift, head, tail;
int trunc = FALSE;
+ int epbs;
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
blksz = dn->dn_datablksz;
+ blkshift = dn->dn_datablkshift;
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- /* If the range is past the end of the file, this is a no-op */
- if (off >= blksz * (dn->dn_maxblkid+1))
- goto out;
if (len == -1ULL) {
len = UINT64_MAX - off;
trunc = TRUE;
@@ -989,11 +987,18 @@
if (ISP2(blksz)) {
head = P2NPHASE(off, blksz);
blkoff = P2PHASE(off, blksz);
+ if ((off >> blkshift) > dn->dn_maxblkid)
+ goto out;
} else {
ASSERT(dn->dn_maxblkid == 0);
if (off == 0 && len >= blksz) {
- /* Freeing the whole block; don't do any head. */
- head = 0;
+ /* Freeing the whole block; fast-track this request */
+ blkid = 0;
+ nblks = 1;
+ goto done;
+		} else if (off >= blksz) {
+ /* Freeing past end-of-data */
+ goto out;
} else {
/* Freeing part of the block. */
head = blksz - off;
@@ -1026,88 +1031,85 @@
}
/* If the range was less than one block, we're done */
- if (len == 0 || off >= blksz * (dn->dn_maxblkid+1))
+ if (len == 0)
+ goto out;
+
+ ASSERT(ISP2(blksz));
+ /* If the remaining range is past end of file, we're done */
+ if ((off >> blkshift) > dn->dn_maxblkid)
+ goto out;
+
+ if (trunc)
+ tail = 0;
+ else
+ tail = P2PHASE(len, blksz);
+
+ ASSERT3U(P2PHASE(off, blksz), ==, 0);
+ /* zero out any partial block data at the end of the range */
+ if (tail) {
+ if (len < tail)
+ tail = len;
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
+ TRUE, FTAG, &db) == 0) {
+ /* don't dirty if not on disk and not dirty */
+ if (db->db_last_dirty ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dbuf_will_dirty(db, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ bzero(db->db.db_data, tail);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ len -= tail;
+ }
+
+ /* If the range did not include a full block, we are done */
+ if (len == 0)
goto out;
- if (!ISP2(blksz)) {
- /*
- * They are freeing the whole block of a
- * non-power-of-two blocksize file. Skip all the messy
- * math.
- */
- ASSERT3U(off, ==, 0);
- ASSERT3U(len, >=, blksz);
- blkid = 0;
- nblks = 1;
- } else {
- int tail;
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- int blkshift = dn->dn_datablkshift;
+ ASSERT(IS_P2ALIGNED(off, blksz));
+ ASSERT(trunc || IS_P2ALIGNED(len, blksz));
+ blkid = off >> blkshift;
+ nblks = len >> blkshift;
+ if (trunc)
+ nblks += 1;
- /* If the remaining range is past end of file, we're done */
- if (off > dn->dn_maxblkid << blkshift)
- goto out;
+ /*
+ * Read in and mark all the level-1 indirects dirty,
+ * so that they will stay in memory until syncing phase.
+ */
+ if (dn->dn_nlevels > 1) {
+ uint64_t i, first, last;
+ int shift = epbs + dn->dn_datablkshift;
- if (off + len == UINT64_MAX)
- tail = 0;
+ first = blkid >> epbs;
+ if (trunc)
+ last = dn->dn_maxblkid >> epbs;
else
- tail = P2PHASE(len, blksz);
+ last = (blkid + nblks - 1) >> epbs;
+ for (i = first; i <= last; i++) {
+ uint64_t ibyte = i << shift;
+ int err;
- ASSERT3U(P2PHASE(off, blksz), ==, 0);
- /* zero out any partial block data at the end of the range */
- if (tail) {
- if (len < tail)
- tail = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
- TRUE, FTAG, &db) == 0) {
- /* don't dirty if not on disk and not dirty */
- if (db->db_last_dirty ||
- (db->db_blkptr &&
- !BP_IS_HOLE(db->db_blkptr))) {
- rw_exit(&dn->dn_struct_rwlock);
- dbuf_will_dirty(db, tx);
- rw_enter(&dn->dn_struct_rwlock,
- RW_WRITER);
- bzero(db->db.db_data, tail);
- }
+ err = dnode_next_offset(dn,
+ DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0);
+ i = ibyte >> shift;
+ if (err == ESRCH || i > last)
+ break;
+ ASSERT(err == 0);
+ db = dbuf_hold_level(dn, 1, i, FTAG);
+ if (db) {
+ dbuf_will_dirty(db, tx);
dbuf_rele(db, FTAG);
}
- len -= tail;
}
- /* If the range did not include a full block, we are done */
- if (len == 0)
- goto out;
-
- /* dirty the left indirects */
- if (dn->dn_nlevels > 1 && off != 0) {
- db = dbuf_hold_level(dn, 1,
- (off - head) >> (blkshift + epbs), FTAG);
- dbuf_will_dirty(db, tx);
- dbuf_rele(db, FTAG);
- }
-
- /* dirty the right indirects */
- if (dn->dn_nlevels > 1 && !trunc) {
- db = dbuf_hold_level(dn, 1,
- (off + len + tail - 1) >> (blkshift + epbs), FTAG);
- dbuf_will_dirty(db, tx);
- dbuf_rele(db, FTAG);
- }
-
- /*
- * Finally, add this range to the dnode range list, we
- * will finish up this free operation in the syncing phase.
- */
- ASSERT(IS_P2ALIGNED(off, 1<<blkshift));
- ASSERT(off + len == UINT64_MAX ||
- IS_P2ALIGNED(len, 1<<blkshift));
- blkid = off >> blkshift;
- nblks = len >> blkshift;
-
- if (trunc)
- dn->dn_maxblkid = (blkid ? blkid - 1 : 0);
}
-
+done:
+ /*
+ * Add this range to the dnode range list.
+ * We will finish up this free operation in the syncing phase.
+ */
mutex_enter(&dn->dn_mtx);
dnode_clear_range(dn, blkid, nblks, tx);
{
@@ -1127,9 +1129,12 @@
}
mutex_exit(&dn->dn_mtx);
- dbuf_free_range(dn, blkid, nblks, tx);
+ dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
dnode_setdirty(dn, tx);
out:
+ if (trunc && dn->dn_maxblkid >= (off >> blkshift))
+ dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0);
+
rw_exit(&dn->dn_struct_rwlock);
}
@@ -1229,7 +1234,7 @@
}
static int
-dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
+dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
int lvl, uint64_t blkfill, uint64_t txg)
{
dmu_buf_impl_t *db = NULL;
@@ -1237,11 +1242,15 @@
uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
uint64_t epb = 1ULL << epbs;
uint64_t minfill, maxfill;
- int i, error, span;
+ boolean_t hole;
+ int i, inc, error, span;
dprintf("probing object %llu offset %llx level %d of %u\n",
dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
+ hole = flags & DNODE_FIND_HOLE;
+ inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
+
if (lvl == dn->dn_phys->dn_nlevels) {
error = 0;
epb = dn->dn_phys->dn_nblkptr;
@@ -1270,7 +1279,8 @@
span = DNODE_SHIFT;
ASSERT(dn->dn_type == DMU_OT_DNODE);
- for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) {
+ for (i = (*offset >> span) & (blkfill - 1);
+ i >= 0 && i < blkfill; i += inc) {
boolean_t newcontents = B_TRUE;
if (txg) {
int j;
@@ -1282,9 +1292,9 @@
}
if (!dnp[i].dn_type == hole && newcontents)
break;
- *offset += 1ULL << span;
+ *offset += (1ULL << span) * inc;
}
- if (i == blkfill)
+ if (i < 0 || i == blkfill)
error = ESRCH;
} else {
blkptr_t *bp = data;
@@ -1298,14 +1308,14 @@
minfill++;
for (i = (*offset >> span) & ((1ULL << epbs) - 1);
- i < epb; i++) {
+ i >= 0 && i < epb; i += inc) {
if (bp[i].blk_fill >= minfill &&
bp[i].blk_fill <= maxfill &&
bp[i].blk_birth > txg)
break;
- *offset += 1ULL << span;
+ *offset += (1ULL << span) * inc;
}
- if (i >= epb)
+ if (i < 0 || i == epb)
error = ESRCH;
}
@@ -1324,64 +1334,66 @@
*
* Examples:
*
- * dnode_next_offset(dn, hole, offset, 1, 1, 0);
- * Finds the next hole/data in a file.
+ * dnode_next_offset(dn, flags, offset, 1, 1, 0);
+ * Finds the next/previous hole/data in a file.
* Used in dmu_offset_next().
*
- * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg);
+ * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
* Finds the next free/allocated dnode an objset's meta-dnode.
* Only finds objects that have new contents since txg (ie.
* bonus buffer changes and content removal are ignored).
* Used in dmu_object_next().
*
- * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
* Finds the next L2 meta-dnode bp that's at most 1/4 full.
* Used in dmu_object_alloc().
*/
int
-dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset,
+dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
int minlvl, uint64_t blkfill, uint64_t txg)
{
+ uint64_t initial_offset = *offset;
int lvl, maxlvl;
int error = 0;
- uint64_t initial_offset = *offset;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (!(flags & DNODE_FIND_HAVELOCK))
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_phys->dn_nlevels == 0) {
- rw_exit(&dn->dn_struct_rwlock);
- return (ESRCH);
+ error = ESRCH;
+ goto out;
}
if (dn->dn_datablkshift == 0) {
if (*offset < dn->dn_datablksz) {
- if (hole)
+ if (flags & DNODE_FIND_HOLE)
*offset = dn->dn_datablksz;
} else {
error = ESRCH;
}
- rw_exit(&dn->dn_struct_rwlock);
- return (error);
+ goto out;
}
maxlvl = dn->dn_phys->dn_nlevels;
for (lvl = minlvl; lvl <= maxlvl; lvl++) {
error = dnode_next_offset_level(dn,
- hole, offset, lvl, blkfill, txg);
+ flags, offset, lvl, blkfill, txg);
if (error != ESRCH)
break;
}
- while (--lvl >= minlvl && error == 0) {
+ while (error == 0 && --lvl >= minlvl) {
error = dnode_next_offset_level(dn,
- hole, offset, lvl, blkfill, txg);
+ flags, offset, lvl, blkfill, txg);
}
- rw_exit(&dn->dn_struct_rwlock);
-
- if (error == 0 && initial_offset > *offset)
+ if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
+ initial_offset < *offset : initial_offset > *offset))
error = ESRCH;
+out:
+ if (!(flags & DNODE_FIND_HAVELOCK))
+ rw_exit(&dn->dn_struct_rwlock);
return (error);
}
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -109,25 +109,26 @@
rw_exit(&dn->dn_struct_rwlock);
}
-static void
+static int
free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
{
- objset_impl_t *os = dn->dn_objset;
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint64_t bytesfreed = 0;
- int i;
+ int i, blocks_freed = 0;
- dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num);
+ dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
for (i = 0; i < num; i++, bp++) {
if (BP_IS_HOLE(bp))
continue;
- bytesfreed += bp_get_dasize(os->os_spa, bp);
+ bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx);
ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
- dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx);
bzero(bp, sizeof (blkptr_t));
+ blocks_freed += 1;
}
dnode_diduse_space(dn, -bytesfreed);
+ return (blocks_freed);
}
#ifdef ZFS_DEBUG
@@ -205,6 +206,8 @@
}
#endif
+#define ALL -1
+
static int
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
dmu_tx_t *tx)
@@ -215,8 +218,18 @@
uint64_t start, end, dbstart, dbend, i;
int epbs, shift, err;
int all = TRUE;
+ int blocks_freed = 0;
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+ /*
+ * There is a small possibility that this block will not be cached:
+ * 1 - if level > 1 and there are no children with level <= 1
+ * 2 - if we didn't get a dirty hold (because this block had just
+ * finished being written -- and so had no holds), and then this
+ * block got evicted before we got here.
+ */
+ if (db->db_state != DB_CACHED)
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+
arc_release(db->db_buf, db);
bp = (blkptr_t *)db->db.db_data;
@@ -240,10 +253,10 @@
if (db->db_level == 1) {
FREE_VERIFY(db, start, end, tx);
- free_blocks(dn, bp, end-start+1, tx);
+ blocks_freed = free_blocks(dn, bp, end-start+1, tx);
arc_buf_freeze(db->db_buf);
- ASSERT(all || db->db_last_dirty);
- return (all);
+ ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+ return (all ? ALL : blocks_freed);
}
for (i = start; i <= end; i++, bp++) {
@@ -254,9 +267,9 @@
ASSERT3U(err, ==, 0);
rw_exit(&dn->dn_struct_rwlock);
- if (free_children(subdb, blkid, nblks, trunc, tx)) {
+ if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) {
ASSERT3P(subdb->db_blkptr, ==, bp);
- free_blocks(dn, bp, 1, tx);
+ blocks_freed += free_blocks(dn, bp, 1, tx);
} else {
all = FALSE;
}
@@ -273,8 +286,8 @@
ASSERT3U(bp->blk_birth, ==, 0);
}
#endif
- ASSERT(all || db->db_last_dirty);
- return (all);
+ ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
+ return (all ? ALL : blocks_freed);
}
/*
@@ -304,15 +317,14 @@
return;
}
ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
- free_blocks(dn, bp + blkid, nblks, tx);
+ (void) free_blocks(dn, bp + blkid, nblks, tx);
if (trunc) {
uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
(dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
ASSERT(off < dn->dn_phys->dn_maxblkid ||
dn->dn_phys->dn_maxblkid == 0 ||
- dnode_next_offset(dn, FALSE, &off,
- 1, 1, 0) != 0);
+ dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
}
return;
}
@@ -330,9 +342,9 @@
ASSERT3U(err, ==, 0);
rw_exit(&dn->dn_struct_rwlock);
- if (free_children(db, blkid, nblks, trunc, tx)) {
+ if (free_children(db, blkid, nblks, trunc, tx) == ALL) {
ASSERT3P(db->db_blkptr, ==, bp);
- free_blocks(dn, bp, 1, tx);
+ (void) free_blocks(dn, bp, 1, tx);
}
dbuf_rele(db, FTAG);
}
@@ -342,7 +354,7 @@
dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
ASSERT(off < dn->dn_phys->dn_maxblkid ||
dn->dn_phys->dn_maxblkid == 0 ||
- dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0);
+ dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
}
}
@@ -442,6 +454,13 @@
ASSERT(dmu_tx_is_syncing(tx));
+ /*
+ * Our contents should have been freed in dnode_sync() by the
+ * free range record inserted by the caller of dnode_free().
+ */
+ ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
+ ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
+
dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
dnode_evict_dbufs(dn);
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
@@ -461,10 +480,6 @@
dn->dn_next_indblkshift[txgoff] = 0;
dn->dn_next_blksz[txgoff] = 0;
- /* free up all the blocks in the file. */
- dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
- ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
-
/* ASSERT(blkptrs are zero); */
ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
ASSERT(dn->dn_type != DMU_OT_NONE);
@@ -541,7 +556,7 @@
ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
SPA_MINBLOCKSIZE) == 0);
ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
- list_head(list) != NULL ||
+ dn->dn_maxblkid == 0 || list_head(list) != NULL ||
dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
dnp->dn_datablkszsec);
dnp->dn_datablkszsec =
@@ -575,22 +590,15 @@
mutex_exit(&dn->dn_mtx);
/* process all the "freed" ranges in the file */
- if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) {
- for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL;
- rp = AVL_PREV(&dn->dn_ranges[txgoff], rp))
- dnode_sync_free_range(dn,
- rp->fr_blkid, rp->fr_nblks, tx);
+ while (rp = avl_last(&dn->dn_ranges[txgoff])) {
+ dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
+ /* grab the mutex so we don't race with dnode_block_freed() */
+ mutex_enter(&dn->dn_mtx);
+ avl_remove(&dn->dn_ranges[txgoff], rp);
+ mutex_exit(&dn->dn_mtx);
+ kmem_free(rp, sizeof (free_range_t));
}
- /* grab the mutex so we don't race with dnode_block_freed() */
- mutex_enter(&dn->dn_mtx);
- for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
- free_range_t *last = rp;
- rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
- avl_remove(&dn->dn_ranges[txgoff], last);
- kmem_free(last, sizeof (free_range_t));
- }
- mutex_exit(&dn->dn_mtx);
if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
dnode_sync_free(dn, tx);
return;
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c Tue Jul 01 12:01:12 2008 -0700
@@ -115,7 +115,7 @@
dsl_dir_diduse_space(ds->ds_dir, delta, compressed, uncompressed, tx);
}
-void
+int
dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
dmu_tx_t *tx)
{
@@ -126,7 +126,7 @@
ASSERT(dmu_tx_is_syncing(tx));
/* No block pointer => nothing to free */
if (BP_IS_HOLE(bp))
- return;
+ return (0);
ASSERT(used > 0);
if (ds == NULL) {
@@ -142,7 +142,7 @@
dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
-used, -compressed, -uncompressed, tx);
dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
- return;
+ return (used);
}
ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
@@ -189,6 +189,8 @@
ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
mutex_exit(&ds->ds_lock);
+
+ return (used);
}
uint64_t
@@ -957,21 +959,11 @@
*/
for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
ds->ds_phys->ds_prev_snap_txg)) {
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);
- dmu_tx_hold_bonus(tx, obj);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- /*
- * Perhaps there is not enough disk
- * space. Just deal with it from
- * dsl_dataset_destroy_sync().
- */
- dmu_tx_abort(tx);
- continue;
- }
- VERIFY(0 == dmu_object_free(os, obj, tx));
- dmu_tx_commit(tx);
+ /*
+ * Ignore errors; if there is not enough disk space,
+ * we will deal with it in dsl_dataset_destroy_sync().
+ */
+ (void) dmu_free_object(os, obj);
}
dmu_objset_close(os);
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -271,7 +271,7 @@
void dbuf_unoverride(dbuf_dirty_record_t *dr);
void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
-void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks,
+void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
struct dmu_tx *);
void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Tue Jul 01 12:01:12 2008 -0700
@@ -154,6 +154,7 @@
* operation, including metadata.
*/
#define DMU_MAX_ACCESS (10<<20) /* 10MB */
+#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
/*
* Public routines to create, destroy, open, and close objsets.
@@ -421,6 +422,9 @@
*/
int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx);
+int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size);
+int dmu_free_object(objset_t *os, uint64_t object);
/*
* Convenience functions.
--- a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -89,6 +89,7 @@
uint64_t txh_space_tofree;
uint64_t txh_space_tooverwrite;
uint64_t txh_space_tounref;
+ uint64_t txh_memory_tohold;
#ifdef ZFS_DEBUG
enum dmu_tx_hold_type txh_type;
uint64_t txh_arg1;
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h Tue Jul 01 12:01:12 2008 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -41,12 +41,19 @@
#endif
/*
- * Flags.
+ * dnode_hold() flags.
*/
#define DNODE_MUST_BE_ALLOCATED 1
#define DNODE_MUST_BE_FREE 2
/*
+ * dnode_next_offset() flags.
+ */
+#define DNODE_FIND_HOLE 1
+#define DNODE_FIND_BACKWARDS 2
+#define DNODE_FIND_HAVELOCK 4
+
+/*
* Fixed constants.
*/
#define DNODE_SHIFT 9 /* 512 bytes */
@@ -227,8 +234,8 @@
uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
void dnode_init(void);
void dnode_fini(void);
-int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
- uint64_t blkfill, uint64_t txg);
+int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
+ int minlvl, uint64_t blkfill, uint64_t txg);
void dnode_evict_dbufs(dnode_t *dn);
#ifdef ZFS_DEBUG
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Tue Jul 01 12:01:12 2008 -0700
@@ -191,7 +191,7 @@
void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
dmu_tx_t *tx);
int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c Tue Jul 01 12:01:12 2008 -0700
@@ -451,6 +451,21 @@
ASSERT3U(error, ==, 0);
}
+static void
+zfs_unlinked_remove(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ char obj_name[17];
+ int error;
+
+ ASSERT(zp->z_unlinked);
+ ASSERT3U(zp->z_phys->zp_links, ==, 0);
+
+ error = zap_remove(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+ zfs_unlinked_hexname(obj_name, zp->z_id), tx);
+ ASSERT3U(error, ==, 0);
+}
+
/*
* Clean up any znodes that had no links when we either crashed or
* (force) umounted the file system.
@@ -574,7 +589,6 @@
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
objset_t *os = zfsvfs->z_os;
znode_t *xzp = NULL;
- char obj_name[17];
dmu_tx_t *tx;
uint64_t acl_obj;
int error;
@@ -589,7 +603,7 @@
if (zfs_purgedir(zp) != 0) {
/*
* Not enough space to delete some xattrs.
- * Leave it on the unlinked set.
+ * Leave it in the unlinked set.
*/
zfs_znode_dmu_fini(zp);
zfs_znode_free(zp);
@@ -598,6 +612,19 @@
}
/*
+ * Free up all the data in the file.
+ */
+ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+ if (error) {
+ /*
+ * Not enough space. Leave the file in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+
+ /*
* If the file has extended attributes, we're going to unlink
* the xattr dir.
*/
@@ -609,7 +636,7 @@
acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
/*
- * Set up the transaction.
+ * Set up the final transaction.
*/
tx = dmu_tx_create(os);
dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
@@ -643,9 +670,7 @@
}
/* Remove this znode from the unlinked set */
- error = zap_remove(os, zfsvfs->z_unlinkedobj,
- zfs_unlinked_hexname(obj_name, zp->z_id), tx);
- ASSERT3U(error, ==, 0);
+ zfs_unlinked_remove(zp, tx);
zfs_znode_delete(zp, tx);
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c Tue Jul 01 12:01:12 2008 -0700
@@ -1304,15 +1304,10 @@
*/
if ((ZTOV(zp)->v_type == VREG) &&
(vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
+ /* we can't hold any locks when calling zfs_freesp() */
+ zfs_dirent_unlock(dl);
+ dl = NULL;
error = zfs_freesp(zp, 0, 0, mode, TRUE);
- if (error == ERESTART &&
- zfsvfs->z_assign == TXG_NOWAIT) {
- /* NB: we already did dmu_tx_wait() */
- zfs_dirent_unlock(dl);
- VN_RELE(ZTOV(zp));
- goto top;
- }
-
if (error == 0) {
vnevent_create(ZTOV(zp), ct);
}
@@ -1379,7 +1374,7 @@
zfs_dirlock_t *dl;
dmu_tx_t *tx;
boolean_t may_delete_now, delete_now = FALSE;
- boolean_t unlinked;
+ boolean_t unlinked, toobig = FALSE;
uint64_t txtype;
pathname_t *realnmp = NULL;
pathname_t realnm;
@@ -1442,8 +1437,13 @@
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_bonus(tx, zp->z_id);
- if (may_delete_now)
- dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+ if (may_delete_now) {
+ toobig =
+ zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
+ /* if the file is too big, only hold_free a token amount */
+ dmu_tx_hold_free(tx, zp->z_id, 0,
+ (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
+ }
/* are there any extended attributes? */
if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
@@ -1487,7 +1487,7 @@
if (unlinked) {
mutex_enter(&vp->v_lock);
- delete_now = may_delete_now &&
+ delete_now = may_delete_now && !toobig &&
vp->v_count == 1 && !vn_has_cached_data(vp) &&
zp->z_phys->zp_xattr == xattr_obj &&
zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
@@ -1533,7 +1533,7 @@
if (!delete_now) {
VN_RELE(vp);
} else if (xzp) {
- /* this rele delayed to prevent nesting transactions */
+ /* this rele is delayed to prevent nesting transactions */
VN_RELE(ZTOV(xzp));
}
@@ -2451,10 +2451,8 @@
* block if there are locks present... this
* should be addressed in openat().
*/
- do {
- err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
- /* NB: we already did dmu_tx_wait() if necessary */
- } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
+ /* XXX - would it be OK to generate a log record here? */
+ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
if (err) {
ZFS_EXIT(zfsvfs);
return (err);
@@ -2725,6 +2723,7 @@
if (mask & AT_MTIME)
ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+ /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
if (mask & AT_SIZE)
zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
else if (mask != 0)
@@ -4236,7 +4235,6 @@
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
-top:
if (cmd != F_FREESP) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
@@ -4255,10 +4253,7 @@
off = bfp->l_start;
len = bfp->l_len; /* 0 means from off to end of file */
- do {
- error = zfs_freesp(zp, off, len, flag, TRUE);
- /* NB: we already did dmu_tx_wait() if necessary */
- } while (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
+ error = zfs_freesp(zp, off, len, flag, TRUE);
ZFS_EXIT(zfsvfs);
return (error);
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c Tue Jul 01 12:01:12 2008 -0700
@@ -1046,14 +1046,14 @@
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
uint64_t obj = zp->z_id;
+ uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
- if (zp->z_phys->zp_acl.z_acl_extern_obj) {
- VERIFY(0 == dmu_object_free(zfsvfs->z_os,
- zp->z_phys->zp_acl.z_acl_extern_obj, tx));
- }
- VERIFY(0 == dmu_object_free(zfsvfs->z_os, obj, tx));
+ if (acl_obj)
+ VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+ VERIFY(0 == dmu_object_free(os, obj, tx));
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
zfs_znode_free(zp);
@@ -1233,137 +1233,177 @@
}
/*
- * Free space in a file.
+ * Increase the file length
*
* IN: zp - znode of file to free data in.
- * off - start of section to free.
- * len - length of section to free (0 => to EOF).
- * flag - current file open mode flags.
+ * end - new end-of-file
*
* RETURN: 0 if success
* error code if failure
*/
-int
-zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+static int
+zfs_extend(znode_t *zp, uint64_t end)
{
- vnode_t *vp = ZTOV(zp);
- dmu_tx_t *tx;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
+ dmu_tx_t *tx;
rl_t *rl;
- uint64_t end = off + len;
- uint64_t size, new_blksz;
- uint64_t pflags = zp->z_phys->zp_flags;
+ uint64_t newblksz;
int error;
- if ((pflags & (ZFS_IMMUTABLE|ZFS_READONLY)) ||
- off < zp->z_phys->zp_size && (pflags & ZFS_APPENDONLY))
- return (EPERM);
-
- if (ZTOV(zp)->v_type == VFIFO)
- return (0);
-
/*
- * If we will change zp_size then lock the whole file,
- * otherwise just lock the range being freed.
+ * We will change zp_size, lock the whole file.
*/
- if (len == 0 || off + len > zp->z_phys->zp_size) {
- rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
- } else {
- rl = zfs_range_lock(zp, off, len, RL_WRITER);
- /* recheck, in case zp_size changed */
- if (off + len > zp->z_phys->zp_size) {
- /* lost race: file size changed, lock whole file */
- zfs_range_unlock(rl);
- rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
- }
- }
+ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
/*
* Nothing to do if file already at desired length.
*/
- size = zp->z_phys->zp_size;
- if (len == 0 && size == off && off != 0) {
+ if (end <= zp->z_phys->zp_size) {
zfs_range_unlock(rl);
return (0);
}
-
- /*
- * Check for any locks in the region to be freed.
- */
- if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
- uint64_t start = off;
- uint64_t extent = len;
-
- if (off > size) {
- start = size;
- extent += off - size;
- } else if (len == 0) {
- extent = size - off;
- }
- if (error = chklock(vp, FWRITE, start, extent, flag, NULL)) {
- zfs_range_unlock(rl);
- return (error);
- }
- }
-
+top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, zp->z_id);
- new_blksz = 0;
- if (end > size &&
+ if (end > zp->z_blksz &&
(!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
/*
* We are growing the file past the current block size.
*/
if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
+ newblksz = MIN(end, SPA_MAXBLOCKSIZE);
} else {
- new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+ newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
}
- dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz));
- } else if (off < size) {
- /*
- * If len == 0, we are truncating the file.
- */
- dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
+ } else {
+ newblksz = 0;
}
error = dmu_tx_assign(tx, zfsvfs->z_assign);
if (error) {
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
dmu_tx_abort(tx);
zfs_range_unlock(rl);
return (error);
}
-
- if (new_blksz)
- zfs_grow_blocksize(zp, new_blksz, tx);
-
- if (end > size || len == 0)
- zp->z_phys->zp_size = end;
-
- if (off < size) {
- objset_t *os = zfsvfs->z_os;
- uint64_t rlen = len;
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
- if (len == 0)
- rlen = -1;
- else if (end > size)
- rlen = size - off;
- VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx));
- }
+ if (newblksz)
+ zfs_grow_blocksize(zp, newblksz, tx);
- if (log) {
- zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
- zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
- }
+ zp->z_phys->zp_size = end;
zfs_range_unlock(rl);
dmu_tx_commit(tx);
+ return (0);
+}
+
+/*
+ * Free space in a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of section to free.
+ * len - length of section to free.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ */
+static int
+zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ rl_t *rl;
+ int error;
+
+ /*
+ * Lock the range being freed.
+ */
+ rl = zfs_range_lock(zp, off, len, RL_WRITER);
+
+ /*
+ * Nothing to do if the range to free begins at or beyond EOF.
+ */
+ if (off >= zp->z_phys->zp_size) {
+ zfs_range_unlock(rl);
+ return (0);
+ }
+
+ if (off + len > zp->z_phys->zp_size)
+ len = zp->z_phys->zp_size - off;
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
+
+ zfs_range_unlock(rl);
+
+ return (error);
+}
+
+/*
+ * Truncate a file
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ */
+static int
+zfs_trunc(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ vnode_t *vp = ZTOV(zp);
+ dmu_tx_t *tx;
+ rl_t *rl;
+ int error;
+
+ /*
+ * We will change zp_size, lock the whole file.
+ */
+ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end >= zp->z_phys->zp_size) {
+ zfs_range_unlock(rl);
+ return (0);
+ }
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
+ if (error) {
+ zfs_range_unlock(rl);
+ return (error);
+ }
+top:
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ zfs_range_unlock(rl);
+ return (error);
+ }
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ zp->z_phys->zp_size = end;
+
+ dmu_tx_commit(tx);
+
+ zfs_range_unlock(rl);
+
/*
* Clear any mapped pages in the truncated region. This has to
* happen outside of the transaction to avoid the possibility of
@@ -1371,10 +1411,10 @@
* about to invalidate.
*/
rw_enter(&zp->z_map_lock, RW_WRITER);
- if (off < size && vn_has_cached_data(vp)) {
+ if (vn_has_cached_data(vp)) {
page_t *pp;
- uint64_t start = off & PAGEMASK;
- int poff = off & PAGEOFFSET;
+ uint64_t start = end & PAGEMASK;
+ int poff = end & PAGEOFFSET;
if (poff != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
/*
@@ -1393,6 +1433,74 @@
return (0);
}
+/*
+ * Free space in a file
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of range
+ * len - length of section to free (0 => to EOF)
+ * flag - current file open mode flags.
+ * log - TRUE if this action should be logged
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+ vnode_t *vp = ZTOV(zp);
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ int error;
+
+ if (off > zp->z_phys->zp_size) {
+ error = zfs_extend(zp, off+len);
+ if (error == 0 && log)
+ goto log;
+ else
+ return (error);
+ }
+
+ /*
+ * Check for any locks in the region to be freed.
+ */
+ if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
+ uint64_t length = (len ? len : zp->z_phys->zp_size - off);
+ if (error = chklock(vp, FWRITE, off, length, flag, NULL))
+ return (error);
+ }
+
+ if (len == 0) {
+ error = zfs_trunc(zp, off);
+ } else {
+ if ((error = zfs_free_range(zp, off, len)) == 0 &&
+ off + len > zp->z_phys->zp_size)
+ error = zfs_extend(zp, off+len);
+ }
+ if (error || !log)
+ return (error);
+log:
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto log;
+ }
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+
+ dmu_tx_commit(tx);
+ return (0);
+}
+
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
--- a/usr/src/uts/common/fs/zfs/zvol.c Tue Jul 01 11:24:56 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zvol.c Tue Jul 01 12:01:12 2008 -0700
@@ -774,24 +774,6 @@
return (0);
}
-static int
-zvol_truncate(zvol_state_t *zv, uint64_t offset, uint64_t size)
-{
- dmu_tx_t *tx;
- int error;
-
- tx = dmu_tx_create(zv->zv_objset);
- dmu_tx_hold_free(tx, ZVOL_OBJ, offset, size);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- return (error);
- }
- error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, offset, size, tx);
- dmu_tx_commit(tx);
- return (0);
-}
-
int
zvol_prealloc(zvol_state_t *zv)
{
@@ -823,7 +805,7 @@
if (error) {
dmu_tx_abort(tx);
kmem_free(data, SPA_MAXBLOCKSIZE);
- (void) zvol_truncate(zv, 0, off);
+ (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
return (error);
}
dmu_write(os, ZVOL_OBJ, off, bytes, data, tx);
@@ -847,7 +829,6 @@
tx = dmu_tx_create(zv->zv_objset);
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
- dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
@@ -859,7 +840,8 @@
dmu_tx_commit(tx);
if (error == 0)
- error = zvol_truncate(zv, volsize, DMU_OBJECT_END);
+ error = dmu_free_long_range(zv->zv_objset,
+ ZVOL_OBJ, volsize, DMU_OBJECT_END);
if (error == 0) {
zv->zv_volsize = volsize;
@@ -1651,7 +1633,6 @@
ASSERT(MUTEX_HELD(&zvol_state_lock));
tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, ZVOL_OBJ, 0, DMU_OBJECT_END);
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
@@ -1690,7 +1671,8 @@
/* Truncate the file */
if (!error)
- error = zvol_truncate(zv, 0, DMU_OBJECT_END);
+ error = dmu_free_long_range(zv->zv_objset,
+ ZVOL_OBJ, 0, DMU_OBJECT_END);
if (error)
return (error);
@@ -1813,7 +1795,7 @@
(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
zvol_free_extents(zv);
- (void) zvol_truncate(zv, 0, DMU_OBJECT_END);
+ (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
zv->zv_flags &= ~ZVOL_DUMPIFIED;
dmu_tx_commit(tx);