--- a/usr/src/uts/common/fs/zfs/zfs_rlock.c Sat Mar 03 05:56:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_rlock.c Sat Mar 03 08:20:50 2007 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -108,26 +108,38 @@
uint64_t end_size;
uint64_t off = new->r_off;
uint64_t len = new->r_len;
- int max_blksz = zp->z_zfsvfs->z_max_blksz;
for (;;) {
/*
- * If in append mode pick up the current end of file.
- * This is done under z_range_lock to avoid races.
+ * Range locking is also used by zvol and uses a
+ * dummied up znode. However, for zvol, we don't need to
+ * append or grow blocksize, and besides we don't have
+ * a z_phys or z_zfsvfs - so skip that processing.
+ *
+ * Yes, this is ugly, and would be solved by not handling
+ * grow or append in range lock code. If that was done then
+ * we could make the range locking code generically available
+ * to other non-zfs consumers.
*/
- if (new->r_type == RL_APPEND)
- new->r_off = zp->z_phys->zp_size;
+ if (zp->z_vnode) { /* caller is ZPL */
+ /*
+ * If in append mode pick up the current end of file.
+ * This is done under z_range_lock to avoid races.
+ */
+ if (new->r_type == RL_APPEND)
+ new->r_off = zp->z_phys->zp_size;
- /*
- * If we need to grow the block size then grab the whole
- * file range. This is also done under z_range_lock to
- * avoid races.
- */
- end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
- if (end_size > zp->z_blksz &&
- (!ISP2(zp->z_blksz) || zp->z_blksz < max_blksz)) {
- new->r_off = 0;
- new->r_len = UINT64_MAX;
+ /*
+ * If we need to grow the block size then grab the whole
+ * file range. This is also done under z_range_lock to
+ * avoid races.
+ */
+ end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
+ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
+ new->r_off = 0;
+ new->r_len = UINT64_MAX;
+ }
}
/*
--- a/usr/src/uts/common/fs/zfs/zvol.c Sat Mar 03 05:56:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/zvol.c Sat Mar 03 08:20:50 2007 -0800
@@ -68,6 +68,8 @@
#include <sys/mkdev.h>
#include <sys/zil.h>
#include <sys/refcount.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_rlock.h>
#include "zfs_namecheck.h"
@@ -101,7 +103,7 @@
uint32_t zv_total_opens; /* total open count */
zilog_t *zv_zilog; /* ZIL handle */
uint64_t zv_txg_assign; /* txg to assign during ZIL replay */
- krwlock_t zv_dslock; /* dmu_sync() rwlock */
+ znode_t zv_znode; /* for range locking */
} zvol_state_t;
/*
@@ -437,14 +439,16 @@
zv->zv_objset = os;
zv->zv_mode = ds_mode;
zv->zv_zilog = zil_open(os, zvol_get_data);
+ mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
+ sizeof (rl_t), offsetof(rl_t, r_node));
+
/* get and cache the blocksize */
error = dmu_object_info(os, ZVOL_OBJ, &doi);
ASSERT(error == 0);
zv->zv_volblocksize = doi.doi_data_block_size;
- rw_init(&zv->zv_dslock, NULL, RW_DEFAULT, NULL);
-
zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector);
zvol_size_changed(zv, dev);
@@ -494,6 +498,8 @@
zv->zv_zilog = NULL;
dmu_objset_close(zv->zv_objset);
zv->zv_objset = NULL;
+ avl_destroy(&zv->zv_znode.z_range_avl);
+ mutex_destroy(&zv->zv_znode.z_range_lock);
ddi_soft_state_free(zvol_state, zv->zv_minor);
@@ -682,8 +688,10 @@
zvol_get_done(dmu_buf_t *db, void *vzgd)
{
zgd_t *zgd = (zgd_t *)vzgd;
+ rl_t *rl = zgd->zgd_rl;
dmu_buf_rele(db, vzgd);
+ zfs_range_unlock(rl);
zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
kmem_free(zgd, sizeof (zgd_t));
}
@@ -697,30 +705,42 @@
zvol_state_t *zv = arg;
objset_t *os = zv->zv_objset;
dmu_buf_t *db;
+ rl_t *rl;
zgd_t *zgd;
+ uint64_t boff; /* block starting offset */
+ int dlen = lr->lr_length; /* length of user data */
int error;
ASSERT(zio);
- ASSERT(lr->lr_length != 0);
+ ASSERT(dlen != 0);
- if (buf != NULL)
- return (dmu_read(os, ZVOL_OBJ,
- lr->lr_offset, lr->lr_length, buf));
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) /* immediate write */
+ return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf));
zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_zilog = zv->zv_zilog;
zgd->zgd_bp = &lr->lr_blkptr;
+ /*
+	 * Lock the range of the block to ensure that while the data is
+	 * being written out and its checksum is being calculated, no other
+	 * thread can change the block.
+ */
+ boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
+ rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
+ RL_READER);
+ zgd->zgd_rl = rl;
+
VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
- /*
- * Have to lock to ensure when when the data is
- * written out and it's checksum is being calculated
- * that no one can change the data.
- */
- rw_enter(&zv->zv_dslock, RW_READER);
error = dmu_sync(zio, db, &lr->lr_blkptr,
lr->lr_common.lrc_txg, zvol_get_done, zgd);
- rw_exit(&zv->zv_dslock);
if (error == 0)
zil_add_vdev(zv->zv_zilog,
DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
@@ -733,6 +753,7 @@
if (error == EINPROGRESS)
return (0);
dmu_buf_rele(db, zgd);
+ zfs_range_unlock(rl);
kmem_free(zgd, sizeof (zgd_t));
return (error);
}
@@ -779,9 +800,10 @@
size_t size, resid;
char *addr;
objset_t *os;
+ rl_t *rl;
int error = 0;
int sync;
- int reading;
+ boolean_t reading;
if (zv == NULL) {
bioerror(bp, ENXIO);
@@ -819,10 +841,8 @@
* A better approach than a per zvol rwlock would be to lock ranges.
*/
reading = bp->b_flags & B_READ;
- if (reading || resid <= zvol_immediate_write_sz)
- rw_enter(&zv->zv_dslock, RW_READER);
- else
- rw_enter(&zv->zv_dslock, RW_WRITER);
+ rl = zfs_range_lock(&zv->zv_znode, off, resid,
+ reading ? RL_READER : RL_WRITER);
while (resid != 0 && off < volsize) {
@@ -851,7 +871,7 @@
addr += size;
resid -= size;
}
- rw_exit(&zv->zv_dslock);
+ zfs_range_unlock(rl);
if ((bp->b_resid = resid) == bp->b_bcount)
bioerror(bp, off > volsize ? EINVAL : error);
@@ -884,8 +904,11 @@
zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
{
zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(dev));
+ rl_t *rl;
int error = 0;
+ rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
+ RL_READER);
while (uio->uio_resid > 0) {
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
@@ -893,6 +916,7 @@
if (error)
break;
}
+ zfs_range_unlock(rl);
return (error);
}
@@ -901,8 +925,11 @@
zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
{
zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(dev));
+ rl_t *rl;
int error = 0;
+ rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
+ RL_WRITER);
while (uio->uio_resid > 0) {
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
uint64_t off = uio->uio_loffset;
@@ -922,6 +949,7 @@
if (error)
break;
}
+ zfs_range_unlock(rl);
return (error);
}