--- a/usr/src/uts/common/fs/zfs/zfs_rlock.c Sat Mar 03 05:56:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_rlock.c Sat Mar 03 08:20:50 2007 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -108,26 +108,38 @@
uint64_t end_size;
uint64_t off = new->r_off;
uint64_t len = new->r_len;
- int max_blksz = zp->z_zfsvfs->z_max_blksz;
for (;;) {
/*
- * If in append mode pick up the current end of file.
- * This is done under z_range_lock to avoid races.
+ * Range locking is also used by zvol and uses a
+ * dummied up znode. However, for zvol, we don't need to
+ * append or grow blocksize, and besides we don't have
+ * a z_phys or z_zfsvfs - so skip that processing.
+ *
+ * Yes, this is ugly, and would be solved by not handling
+ * grow or append in range lock code. If that was done then
+ * we could make the range locking code generically available
+ * to other non-zfs consumers.
*/
- if (new->r_type == RL_APPEND)
- new->r_off = zp->z_phys->zp_size;
+ if (zp->z_vnode) { /* caller is ZPL */
+ /*
+ * If in append mode pick up the current end of file.
+ * This is done under z_range_lock to avoid races.
+ */
+ if (new->r_type == RL_APPEND)
+ new->r_off = zp->z_phys->zp_size;
- /*
- * If we need to grow the block size then grab the whole
- * file range. This is also done under z_range_lock to
- * avoid races.
- */
- end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
- if (end_size > zp->z_blksz &&
- (!ISP2(zp->z_blksz) || zp->z_blksz < max_blksz)) {
- new->r_off = 0;
- new->r_len = UINT64_MAX;
+ /*
+ * If we need to grow the block size then grab the whole
+ * file range. This is also done under z_range_lock to
+ * avoid races.
+ */
+ end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
+ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
+ new->r_off = 0;
+ new->r_len = UINT64_MAX;
+ }
}
/*
--- a/usr/src/uts/common/fs/zfs/zvol.c Sat Mar 03 05:56:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/zvol.c Sat Mar 03 08:20:50 2007 -0800
@@ -68,6 +68,8 @@
#include <sys/mkdev.h>
#include <sys/zil.h>
#include <sys/refcount.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_rlock.h>
#include "zfs_namecheck.h"
@@ -101,7 +103,7 @@
uint32_t zv_total_opens; /* total open count */
zilog_t *zv_zilog; /* ZIL handle */
uint64_t zv_txg_assign; /* txg to assign during ZIL replay */
- krwlock_t zv_dslock; /* dmu_sync() rwlock */
+ znode_t zv_znode; /* for range locking */
} zvol_state_t;
/*
@@ -437,14 +439,16 @@
zv->zv_objset = os;
zv->zv_mode = ds_mode;
zv->zv_zilog = zil_open(os, zvol_get_data);
+ mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
+ sizeof (rl_t), offsetof(rl_t, r_node));
+
/* get and cache the blocksize */
error = dmu_object_info(os, ZVOL_OBJ, &doi);
ASSERT(error == 0);
zv->zv_volblocksize = doi.doi_data_block_size;
- rw_init(&zv->zv_dslock, NULL, RW_DEFAULT, NULL);
-
zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector);
zvol_size_changed(zv, dev);
@@ -494,6 +498,8 @@
zv->zv_zilog = NULL;
dmu_objset_close(zv->zv_objset);
zv->zv_objset = NULL;
+ avl_destroy(&zv->zv_znode.z_range_avl);
+ mutex_destroy(&zv->zv_znode.z_range_lock);
ddi_soft_state_free(zvol_state, zv->zv_minor);
@@ -682,8 +688,10 @@
zvol_get_done(dmu_buf_t *db, void *vzgd)
{
zgd_t *zgd = (zgd_t *)vzgd;
+ rl_t *rl = zgd->zgd_rl;
dmu_buf_rele(db, vzgd);
+ zfs_range_unlock(rl);
zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
kmem_free(zgd, sizeof (zgd_t));
}
@@ -697,30 +705,42 @@
zvol_state_t *zv = arg;
objset_t *os = zv->zv_objset;
dmu_buf_t *db;
+ rl_t *rl;
zgd_t *zgd;
+ uint64_t boff; /* block starting offset */
+ int dlen = lr->lr_length; /* length of user data */
int error;
ASSERT(zio);
- ASSERT(lr->lr_length != 0);
+ ASSERT(dlen != 0);
- if (buf != NULL)
- return (dmu_read(os, ZVOL_OBJ,
- lr->lr_offset, lr->lr_length, buf));
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) /* immediate write */
+ return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf));
zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_zilog = zv->zv_zilog;
zgd->zgd_bp = &lr->lr_blkptr;
+ /*
+	 * Lock the range of the block to ensure that while the data is
+	 * being written out and its checksum is being calculated, no other
+	 * thread can change the block.
+ */
+ boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
+ rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
+ RL_READER);
+ zgd->zgd_rl = rl;
+
VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
- /*
- * Have to lock to ensure when when the data is
- * written out and it's checksum is being calculated
- * that no one can change the data.
- */
- rw_enter(&zv->zv_dslock, RW_READER);
error = dmu_sync(zio, db, &lr->lr_blkptr,
lr->lr_common.lrc_txg, zvol_get_done, zgd);
- rw_exit(&zv->zv_dslock);
if (error == 0)
zil_add_vdev(zv->zv_zilog,
DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
@@ -733,6 +753,7 @@
if (error == EINPROGRESS)
return (0);
dmu_buf_rele(db, zgd);
+ zfs_range_unlock(rl);
kmem_free(zgd, sizeof (zgd_t));
return (error);
}
@@ -779,9 +800,10 @@
size_t size, resid;
char *addr;
objset_t *os;
+ rl_t *rl;
int error = 0;
int sync;
- int reading;
+ boolean_t reading;
if (zv == NULL) {
bioerror(bp, ENXIO);
@@ -819,10 +841,8 @@
* A better approach than a per zvol rwlock would be to lock ranges.
*/
reading = bp->b_flags & B_READ;
- if (reading || resid <= zvol_immediate_write_sz)
- rw_enter(&zv->zv_dslock, RW_READER);
- else
- rw_enter(&zv->zv_dslock, RW_WRITER);
+ rl = zfs_range_lock(&zv->zv_znode, off, resid,
+ reading ? RL_READER : RL_WRITER);
while (resid != 0 && off < volsize) {
@@ -851,7 +871,7 @@
addr += size;
resid -= size;
}
- rw_exit(&zv->zv_dslock);
+ zfs_range_unlock(rl);
if ((bp->b_resid = resid) == bp->b_bcount)
bioerror(bp, off > volsize ? EINVAL : error);
@@ -884,8 +904,11 @@
zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
{
zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(dev));
+ rl_t *rl;
int error = 0;
+ rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
+ RL_READER);
while (uio->uio_resid > 0) {
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
@@ -893,6 +916,7 @@
if (error)
break;
}
+ zfs_range_unlock(rl);
return (error);
}
@@ -901,8 +925,11 @@
zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
{
zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(dev));
+ rl_t *rl;
int error = 0;
+ rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
+ RL_WRITER);
while (uio->uio_resid > 0) {
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
uint64_t off = uio->uio_loffset;
@@ -922,6 +949,7 @@
if (error)
break;
}
+ zfs_range_unlock(rl);
return (error);
}