6875779 zfs user accounting callbacks can be simplified
author Matthew Ahrens <Matthew.Ahrens@Sun.COM>
Fri, 28 Aug 2009 13:57:58 -0700
changeset 10407 34e10c4af053
parent 10406 fd30909fc9f8
child 10408 64c355043d3b
6875779 zfs user accounting callbacks can be simplified
6771468 ::blkptr prints incorrectly on 32-bit
6832861 zcmd_alloc_dst_nvlist's default size is too small
6876808 want ::refcount to print refcount_t details
usr/src/cmd/mdb/common/modules/zfs/zfs.c
usr/src/lib/libzfs/common/libzfs_util.c
usr/src/uts/common/fs/zfs/arc.c
usr/src/uts/common/fs/zfs/dmu_objset.c
usr/src/uts/common/fs/zfs/sys/arc.h
usr/src/uts/common/fs/zfs/sys/dmu.h
usr/src/uts/common/fs/zfs/sys/zap.h
usr/src/uts/common/fs/zfs/zap.c
usr/src/uts/common/fs/zfs/zfs_vfsops.c
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Fri Aug 28 11:22:11 2009 -0700
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Fri Aug 28 13:57:58 2009 -0700
@@ -36,6 +36,7 @@
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio_compress.h>
+#include <ctype.h>
 
 #ifndef _KERNEL
 #include "../genunix/list.h"
@@ -450,7 +451,7 @@
 		    DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva));
 		mdb_printf("DVA[%d]:       GANG: %-5s  GRID:  %04x\t"
 		    "ASIZE: %llx\n", i, DVA_GET_GANG(dva) ? "TRUE" : "FALSE",
-		    DVA_GET_GRID(dva), DVA_GET_ASIZE(dva));
+		    (int)DVA_GET_GRID(dva), DVA_GET_ASIZE(dva));
 		mdb_printf("DVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", i,
 		    DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), BP_GET_PSIZE(&bp),
 		    BP_SHOULD_BYTESWAP(&bp) ? "e" : "",
@@ -464,7 +465,7 @@
 	    BP_GET_BYTEORDER(&bp) ? "LITTLE" : "BIG",
 	    doti[BP_GET_TYPE(&bp)].ot_name);
 	mdb_printf("BIRTH:  %-16llx   LEVEL: %-2d\tFILL:  %llx\n",
-	    bp.blk_birth, BP_GET_LEVEL(&bp), bp.blk_fill);
+	    bp.blk_birth, (int)BP_GET_LEVEL(&bp), bp.blk_fill);
 	mdb_printf("CKFUNC: %-16s\t\tCOMP:  %s\n",
 	    zci[BP_GET_CHECKSUM(&bp)].ci_name,
 	    zct[BP_GET_COMPRESS(&bp)].ci_name);
@@ -2145,6 +2146,114 @@
 	return (DCMD_OK);
 }
 
+/*
+ * List-walk callback used by ::refcount: print one "struct reference" entry.
+ * A non-zero arg marks the entry as coming from the released-holds list.
+ */
+/* ARGSUSED */
+static int
+reference_cb(uintptr_t addr, const void *ignored, void *arg)
+{
+	static int gotid;
+	static mdb_ctf_id_t ref_id;
+	uintptr_t ref_holder, ref_removed;
+	uint64_t ref_number;
+	boolean_t holder_is_str = B_FALSE;
+	char holder_str[128];
+	boolean_t removed = (boolean_t)(uintptr_t)arg;
+
+	if (!gotid) {
+		if (mdb_ctf_lookup_by_name("struct reference", &ref_id) == -1) {
+			mdb_warn("couldn't find struct reference");
+			return (WALK_ERR);
+		}
+		gotid = TRUE;
+	}
+
+	if (GETMEMBID(addr, &ref_id, ref_holder, ref_holder) ||
+	    GETMEMBID(addr, &ref_id, ref_removed, ref_removed) ||
+	    GETMEMBID(addr, &ref_id, ref_number, ref_number))
+		return (WALK_ERR);
+	/* Print the tag as text only if it is a fully printable string. */
+	if (mdb_readstr(holder_str, sizeof (holder_str), ref_holder) != -1) {
+		char *cp;
+		holder_is_str = B_TRUE;
+		for (cp = holder_str; *cp; cp++) {
+			/* isprint() is undefined for negative char */
+			if (!isprint((unsigned char)*cp)) {
+				holder_is_str = B_FALSE;
+				break;
+			}
+		}
+	}
+
+	if (removed)
+		mdb_printf("removed ");
+	mdb_printf("reference ");
+	if (ref_number != 1)
+		mdb_printf("with count=%llu ", ref_number);
+	mdb_printf("with tag %p", (void*)ref_holder);
+	if (holder_is_str)
+		mdb_printf(" \"%s\"", holder_str);
+	mdb_printf(", held at:\n");
+
+	(void) mdb_call_dcmd("whatis", addr, DCMD_ADDRSPEC, 0, NULL);
+	if (removed) {
+		mdb_printf("removed at:\n");
+		(void) mdb_call_dcmd("whatis", ref_removed,
+		    DCMD_ADDRSPEC, 0, NULL);
+	}
+	mdb_printf("\n");
+
+	return (WALK_NEXT);
+}
+
+/* ::refcount: print a refcount_t's hold counts and walk both hold lists. */
+/* ARGSUSED */
+static int
+refcount(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	static mdb_ctf_id_t rc_id;
+	static int gotid;
+	uint64_t rc_count, rc_removed_count;
+	ulong_t bitoff;
+
+	/* The dcmd only makes sense with an explicit address. */
+	if ((flags & DCMD_ADDRSPEC) == 0)
+		return (DCMD_USAGE);
+
+	if (!gotid) {
+		if (mdb_ctf_lookup_by_name("struct refcount", &rc_id) == -1) {
+			mdb_warn("couldn't find struct refcount");
+			return (DCMD_ERR);
+		}
+		gotid = TRUE;
+	}
+
+	if (GETMEMBID(addr, &rc_id, rc_count, rc_count) ||
+	    GETMEMBID(addr, &rc_id, rc_removed_count, rc_removed_count))
+		return (DCMD_ERR);
+
+	mdb_printf("refcount_t at %p has %llu current holds, "
+	    "%llu recently released holds\n",
+	    addr, (longlong_t)rc_count, (longlong_t)rc_removed_count);
+
+	/* Each member offset is divided by NBBY to form the list address. */
+	if (rc_count > 0)
+		mdb_printf("current holds:\n");
+	if (mdb_ctf_offsetof(rc_id, "rc_list", &bitoff) == -1)
+		return (DCMD_ERR);
+	mdb_pwalk("list", reference_cb, (void*)B_FALSE, addr + bitoff / NBBY);
+
+	if (rc_removed_count > 0)
+		mdb_printf("released holds:\n");
+	if (mdb_ctf_offsetof(rc_id, "rc_removed", &bitoff) == -1)
+		return (DCMD_ERR);
+	mdb_pwalk("list", reference_cb, (void*)B_TRUE, addr + bitoff / NBBY);
+
+	return (DCMD_OK);
+}
+
 /*
  * MDB module linkage information:
  *
@@ -2186,6 +2295,7 @@
 	    "given a spa_t, print block type stats from last scrub",
 	    zfs_blkstats },
 	{ "zfs_params", "", "print zfs tunable parameters", zfs_params },
+	{ "refcount", "", "print refcount_t holders", refcount },
 	{ NULL }
 };
 
--- a/usr/src/lib/libzfs/common/libzfs_util.c	Fri Aug 28 11:22:11 2009 -0700
+++ b/usr/src/lib/libzfs/common/libzfs_util.c	Fri Aug 28 13:57:58 2009 -0700
@@ -688,7 +688,7 @@
 zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len)
 {
 	if (len == 0)
-		len = 2048;
+		len = 4*1024;
 	zc->zc_nvlist_dst_size = len;
 	if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t)
 	    zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == NULL)
--- a/usr/src/uts/common/fs/zfs/arc.c	Fri Aug 28 11:22:11 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/arc.c	Fri Aug 28 13:57:58 2009 -0700
@@ -2842,41 +2842,6 @@
 	return (0);
 }
 
-/*
- * arc_read() variant to support pool traversal.  If the block is already
- * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
- * The idea is that we don't want pool traversal filling up memory, but
- * if the ARC already has the data anyway, we shouldn't pay for the I/O.
- */
-int
-arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
-{
-	arc_buf_hdr_t *hdr;
-	kmutex_t *hash_mtx;
-	uint64_t guid = spa_guid(spa);
-	int rc = 0;
-
-	hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
-
-	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
-		arc_buf_t *buf = hdr->b_buf;
-
-		ASSERT(buf);
-		while (buf->b_data == NULL) {
-			buf = buf->b_next;
-			ASSERT(buf);
-		}
-		bcopy(buf->b_data, data, hdr->b_size);
-	} else {
-		rc = ENOENT;
-	}
-
-	if (hash_mtx)
-		mutex_exit(hash_mtx);
-
-	return (rc);
-}
-
 void
 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
 {
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Fri Aug 28 11:22:11 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Fri Aug 28 13:57:58 2009 -0700
@@ -1017,18 +1017,39 @@
 	    os->os_userused_dnode);
 }
 
+/* Charge (or, if subtract, refund) dnp's space to its user and group. */
+static void
+do_userquota_callback(objset_t *os, dnode_phys_t *dnp,
+    boolean_t subtract, dmu_tx_t *tx)
+{
+	static const char zerobuf[DN_MAX_BONUSLEN] = {0};
+	int64_t delta = DNODE_SIZE + DN_USED_BYTES(dnp);
+	uint64_t uid, gid;
+
+	ASSERT(dnp->dn_type != 0 ||
+	    (bcmp(DN_BONUS(dnp), zerobuf, DN_MAX_BONUSLEN) == 0 &&
+	    DN_USED_BYTES(dnp) == 0));
+	/* Nothing to do unless the dnode is accounted and yields its ids. */
+	if (!(dnp->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ||
+	    used_cbs[os->os_phys->os_type](dnp->dn_bonustype,
+	    DN_BONUS(dnp), &uid, &gid) != 0)
+		return;
+	delta = subtract ? -delta : delta;
+	VERIFY(0 == zap_increment_int(os, DMU_USERUSED_OBJECT,
+	    uid, delta, tx));
+	VERIFY(0 == zap_increment_int(os, DMU_GROUPUSED_OBJECT,
+	    gid, delta, tx));
+}
+
 void
 dmu_objset_do_userquota_callbacks(objset_t *os, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	list_t *list = &os->os_synced_dnodes;
-	static const char zerobuf[DN_MAX_BONUSLEN] = {0};
 
 	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
 
 	while (dn = list_head(list)) {
-		dmu_object_type_t bonustype;
-
 		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
 		ASSERT(dn->dn_oldphys);
 		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
@@ -1046,31 +1067,14 @@
 		}
 
 		/*
-		 * If the object was not previously
-		 * accounted, pretend that it was free.
-		 */
-		if (!(dn->dn_oldphys->dn_flags &
-		    DNODE_FLAG_USERUSED_ACCOUNTED)) {
-			bzero(dn->dn_oldphys, sizeof (dnode_phys_t));
-		}
-
-		/*
-		 * If the object was freed, use the previous bonustype.
+		 * We intentionally modify the zap object even if the
+		 * net delta (due to phys-oldphys) is zero.  Otherwise
+		 * the block of the zap obj could be shared between
+		 * datasets but need to be different between them after
+		 * a bprewrite.
 		 */
-		bonustype = dn->dn_phys->dn_bonustype ?
-		    dn->dn_phys->dn_bonustype : dn->dn_oldphys->dn_bonustype;
-		ASSERT(dn->dn_phys->dn_type != 0 ||
-		    (bcmp(DN_BONUS(dn->dn_phys), zerobuf,
-		    DN_MAX_BONUSLEN) == 0 &&
-		    DN_USED_BYTES(dn->dn_phys) == 0));
-		ASSERT(dn->dn_oldphys->dn_type != 0 ||
-		    (bcmp(DN_BONUS(dn->dn_oldphys), zerobuf,
-		    DN_MAX_BONUSLEN) == 0 &&
-		    DN_USED_BYTES(dn->dn_oldphys) == 0));
-		used_cbs[os->os_phys->os_type](os, bonustype,
-		    DN_BONUS(dn->dn_oldphys), DN_BONUS(dn->dn_phys),
-		    DN_USED_BYTES(dn->dn_oldphys),
-		    DN_USED_BYTES(dn->dn_phys), tx);
+		do_userquota_callback(os, dn->dn_oldphys, B_TRUE, tx);
+		do_userquota_callback(os, dn->dn_phys, B_FALSE, tx);
 
 		/*
 		 * The mutex is needed here for interlock with dnode_allocate.
--- a/usr/src/uts/common/fs/zfs/sys/arc.h	Fri Aug 28 11:22:11 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h	Fri Aug 28 13:57:58 2009 -0700
@@ -120,7 +120,6 @@
     int zio_flags, const zbookmark_t *zb);
 int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     zio_done_func_t *done, void *private, uint32_t arc_flags);
-int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
 
 void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
 int arc_buf_evict(arc_buf_t *buf);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Fri Aug 28 11:22:11 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Fri Aug 28 13:57:58 2009 -0700
@@ -581,9 +581,8 @@
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp);
 
-typedef void objset_used_cb_t(objset_t *os, dmu_object_type_t bonustype,
-    void *oldbonus, void *newbonus, uint64_t oldused, uint64_t newused,
-    dmu_tx_t *tx);
+typedef int objset_used_cb_t(dmu_object_type_t bonustype,
+    void *bonus, uint64_t *userp, uint64_t *groupp);
 extern void dmu_objset_register_type(dmu_objset_type_t ost,
     objset_used_cb_t *cb);
 extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
--- a/usr/src/uts/common/fs/zfs/sys/zap.h	Fri Aug 28 11:22:11 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zap.h	Fri Aug 28 13:57:58 2009 -0700
@@ -255,6 +255,8 @@
 int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
 int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
 int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
+int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+    dmu_tx_t *tx);
 
 struct zap;
 struct zap_leaf;
--- a/usr/src/uts/common/fs/zfs/zap.c	Fri Aug 28 11:22:11 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zap.c	Fri Aug 28 13:57:58 2009 -0700
@@ -978,6 +978,30 @@
 	return (zap_lookup(os, obj, name, 8, 1, &value));
 }
 
+/* Add delta to the 8-byte integer that obj stores under key's hex name. */
+int
+zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+    dmu_tx_t *tx)
+{
+	char attr[20];
+	uint64_t count = 0;
+	int error;
+
+	if (delta == 0)
+		return (0);
+	(void) snprintf(attr, sizeof (attr), "%llx", (longlong_t)key);
+	error = zap_lookup(os, obj, attr, 8, 1, &count);
+	if (error != 0 && error != ENOENT)
+		return (error);
+	/* ENOENT is fine here; count was initialized to 0 for that case. */
+	count += delta;
+	/* An entry whose total reaches zero is removed, not stored as 0. */
+	if (count != 0)
+		return (zap_update(os, obj, attr, 8, 1, &count, tx));
+	return (zap_remove(os, obj, attr, tx));
+}
+
+
 /*
  * Routines for iterating over the attributes.
  */
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Fri Aug 28 11:22:11 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Fri Aug 28 13:57:58 2009 -0700
@@ -594,36 +594,18 @@
 	ASSERT(err == 0);
 }
 
-static void
-zfs_space_delta_cb(objset_t *os, dmu_object_type_t bonustype,
-    void *oldbonus, void *newbonus,
-    uint64_t oldused, uint64_t newused, dmu_tx_t *tx)
+static int
+zfs_space_delta_cb(dmu_object_type_t bonustype, void *bonus,
+    uint64_t *userp, uint64_t *groupp)
 {
-	znode_phys_t *oldznp = oldbonus;
-	znode_phys_t *newznp = newbonus;
+	znode_phys_t *znp = bonus;
 
 	if (bonustype != DMU_OT_ZNODE)
-		return;
-
-	/* We charge 512 for the dnode (if it's allocated). */
-	if (oldznp->zp_gen != 0)
-		oldused += DNODE_SIZE;
-	if (newznp->zp_gen != 0)
-		newused += DNODE_SIZE;
+		return (ENOENT);
 
-	if (oldznp->zp_uid == newznp->zp_uid) {
-		uidacct(os, B_FALSE, oldznp->zp_uid, newused-oldused, tx);
-	} else {
-		uidacct(os, B_FALSE, oldznp->zp_uid, -oldused, tx);
-		uidacct(os, B_FALSE, newznp->zp_uid, newused, tx);
-	}
-
-	if (oldznp->zp_gid == newznp->zp_gid) {
-		uidacct(os, B_TRUE, oldznp->zp_gid, newused-oldused, tx);
-	} else {
-		uidacct(os, B_TRUE, oldznp->zp_gid, -oldused, tx);
-		uidacct(os, B_TRUE, newznp->zp_gid, newused, tx);
-	}
+	*userp = znp->zp_uid;
+	*groupp = znp->zp_gid;
+	return (0);
 }
 
 static void