6716117 ZFS needs native system attribute infrastructure
6516171 zpl symlinks should have their own object type
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c Tue Mar 16 09:43:38 2010 -0600
@@ -38,6 +38,8 @@
#include <sys/zap_leaf.h>
#include <sys/zap_impl.h>
#include <ctype.h>
+#include <sys/zfs_acl.h>
+#include <sys/sa_impl.h>
#ifndef _KERNEL
#include "../genunix/list.h"
@@ -217,7 +219,7 @@
objset_name(uintptr_t addr, char *buf)
{
static int gotid;
- static mdb_ctf_id_t os_id, ds_id;
+ static mdb_ctf_id_t osi_id, ds_id;
uintptr_t os_dsl_dataset;
char ds_snapname[MAXNAMELEN];
uintptr_t ds_dir;
@@ -225,9 +227,9 @@
buf[0] = '\0';
if (!gotid) {
- if (mdb_ctf_lookup_by_name("struct objset",
- &os_id) == -1) {
- mdb_warn("couldn't find struct objset");
+ if (mdb_ctf_lookup_by_name("struct objset_impl",
+ &osi_id) == -1) {
+ mdb_warn("couldn't find struct objset_impl");
return (DCMD_ERR);
}
if (mdb_ctf_lookup_by_name("struct dsl_dataset",
@@ -239,7 +241,7 @@
gotid = TRUE;
}
- if (GETMEMBID(addr, &os_id, os_dsl_dataset, os_dsl_dataset))
+ if (GETMEMBID(addr, &osi_id, os_dsl_dataset, os_dsl_dataset))
return (DCMD_ERR);
if (os_dsl_dataset == 0) {
@@ -429,7 +431,7 @@
(void) mdb_snprintf(objectname, sizeof (objectname), "%llx",
(u_longlong_t)db.db_object);
- if (blkid == DB_BONUS_BLKID)
+ if (blkid == DMU_BONUS_BLKID)
(void) strcpy(blkidname, "bonus");
else
(void) mdb_snprintf(blkidname, sizeof (blkidname), "%llx",
@@ -716,7 +718,7 @@
if (blkid) {
if (strcmp(blkid, "bonus") == 0) {
- data.blkid = DB_BONUS_BLKID;
+ data.blkid = DMU_BONUS_BLKID;
} else {
data.blkid = mdb_strtoull(blkid);
}
@@ -2291,6 +2293,602 @@
return (DCMD_OK);
}
+/* ARGSUSED */
+static int
+sa_attr_table(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ sa_attr_table_t *table;
+ sa_os_t sa_os;
+ char *name;
+ int i;
+
+ if (mdb_vread(&sa_os, sizeof (sa_os_t), addr) == -1) {
+ mdb_warn("failed to read sa_os at %p", addr);
+ return (DCMD_ERR);
+ }
+
+ table = mdb_alloc(sizeof (sa_attr_table_t) * sa_os.sa_num_attrs,
+ UM_SLEEP | UM_GC);
+ name = mdb_alloc(MAXPATHLEN, UM_SLEEP | UM_GC);
+
+ if (mdb_vread(table, sizeof (sa_attr_table_t) * sa_os.sa_num_attrs,
+ (uintptr_t)sa_os.sa_attr_table) == -1) {
+ mdb_warn("failed to read sa_os at %p", addr);
+ return (DCMD_ERR);
+ }
+
+ mdb_printf("%<u>%-10s %-10s %-10s %-10s %s%</u>\n",
+ "ATTR ID", "REGISTERED", "LENGTH", "BSWAP", "NAME");
+ for (i = 0; i != sa_os.sa_num_attrs; i++) {
+ mdb_readstr(name, MAXPATHLEN, (uintptr_t)table[i].sa_name);
+ mdb_printf("%5x %8x %8x %8x %-s\n",
+ (int)table[i].sa_attr, (int)table[i].sa_registered,
+ (int)table[i].sa_length, table[i].sa_byteswap, name);
+ }
+
+ return (DCMD_OK);
+}
+
+static int
+sa_get_off_table(uintptr_t addr, uint32_t **off_tab, int attr_count)
+{
+ uintptr_t idx_table;
+
+ if (GETMEMB(addr, struct sa_idx_tab, sa_idx_tab, idx_table)) {
+ mdb_printf("can't find offset table in sa_idx_tab\n");
+ return (-1);
+ }
+
+ *off_tab = mdb_alloc(attr_count * sizeof (uint32_t),
+ UM_SLEEP | UM_GC);
+
+ if (mdb_vread(*off_tab,
+ attr_count * sizeof (uint32_t), idx_table) == -1) {
+ mdb_warn("failed to attribute offset table %p", idx_table);
+ return (-1);
+ }
+
+ return (DCMD_OK);
+}
+
+/*ARGSUSED*/
+static int
+sa_attr_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ uint32_t *offset_tab;
+ int attr_count;
+ uint64_t attr_id;
+ uintptr_t attr_addr;
+ uintptr_t bonus_tab, spill_tab;
+ uintptr_t db_bonus, db_spill;
+ uintptr_t os, os_sa;
+ uintptr_t db_data;
+
+ if (argc != 1)
+ return (DCMD_USAGE);
+
+ if (argv[0].a_type == MDB_TYPE_STRING)
+ attr_id = mdb_strtoull(argv[0].a_un.a_str);
+ else
+ return (DCMD_USAGE);
+
+ if (GETMEMB(addr, struct sa_handle, sa_bonus_tab, bonus_tab) ||
+ GETMEMB(addr, struct sa_handle, sa_spill_tab, spill_tab) ||
+ GETMEMB(addr, struct sa_handle, sa_os, os) ||
+ GETMEMB(addr, struct sa_handle, sa_bonus, db_bonus) ||
+ GETMEMB(addr, struct sa_handle, sa_spill, db_spill)) {
+ mdb_printf("Can't find necessary information in sa_handle "
+ "in sa_handle\n");
+ return (DCMD_ERR);
+ }
+
+ if (GETMEMB(os, struct objset, os_sa, os_sa)) {
+ mdb_printf("Can't find os_sa in objset\n");
+ return (DCMD_ERR);
+ }
+
+ if (GETMEMB(os_sa, struct sa_os, sa_num_attrs, attr_count)) {
+ mdb_printf("Can't find sa_num_attrs\n");
+ return (DCMD_ERR);
+ }
+
+ if (attr_id > attr_count) {
+ mdb_printf("attribute id number is out of range\n");
+ return (DCMD_ERR);
+ }
+
+ if (bonus_tab) {
+ if (sa_get_off_table(bonus_tab, &offset_tab,
+ attr_count) == -1) {
+ return (DCMD_ERR);
+ }
+
+ if (GETMEMB(db_bonus, struct dmu_buf, db_data, db_data)) {
+ mdb_printf("can't find db_data in bonus dbuf\n");
+ return (DCMD_ERR);
+ }
+ }
+
+ if (bonus_tab && !TOC_ATTR_PRESENT(offset_tab[attr_id]) &&
+ spill_tab == NULL) {
+ mdb_printf("Attribute does not exist\n");
+ return (DCMD_ERR);
+ } else if (!TOC_ATTR_PRESENT(offset_tab[attr_id]) && spill_tab) {
+ if (sa_get_off_table(spill_tab, &offset_tab,
+ attr_count) == -1) {
+ return (DCMD_ERR);
+ }
+ if (GETMEMB(db_spill, struct dmu_buf, db_data, db_data)) {
+ mdb_printf("can't find db_data in spill dbuf\n");
+ return (DCMD_ERR);
+ }
+ if (!TOC_ATTR_PRESENT(offset_tab[attr_id])) {
+ mdb_printf("Attribute does not exist\n");
+ return (DCMD_ERR);
+ }
+ }
+ attr_addr = db_data + TOC_OFF(offset_tab[attr_id]);
+ mdb_printf("%p\n", attr_addr);
+ return (DCMD_OK);
+}
+
+/* ARGSUSED */
+static int
+zfs_ace_print_common(uintptr_t addr, uint_t flags,
+ uint64_t id, uint32_t access_mask, uint16_t ace_flags,
+ uint16_t ace_type, int verbose)
+{
+ if (DCMD_HDRSPEC(flags) && !verbose)
+ mdb_printf("%<u>%-?s %-8s %-8s %-8s %s%</u>\n",
+ "ADDR", "FLAGS", "MASK", "TYPE", "ID");
+
+ if (!verbose) {
+ mdb_printf("%0?p %-8x %-8x %-8x %-llx\n", addr,
+ ace_flags, access_mask, ace_type, id);
+ return (DCMD_OK);
+ }
+
+ switch (ace_flags & ACE_TYPE_FLAGS) {
+ case ACE_OWNER:
+ mdb_printf("owner@:");
+ break;
+ case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+ mdb_printf("group@:");
+ break;
+ case ACE_EVERYONE:
+ mdb_printf("everyone@:");
+ break;
+ case ACE_IDENTIFIER_GROUP:
+ mdb_printf("group:%llx:", (u_longlong_t)id);
+ break;
+ case 0: /* User entry */
+ mdb_printf("user:%llx:", (u_longlong_t)id);
+ break;
+ }
+
+ /* print out permission mask */
+ if (access_mask & ACE_READ_DATA)
+ mdb_printf("r");
+ else
+ mdb_printf("-");
+ if (access_mask & ACE_WRITE_DATA)
+ mdb_printf("w");
+ else
+ mdb_printf("-");
+ if (access_mask & ACE_EXECUTE)
+ mdb_printf("x");
+ else
+ mdb_printf("-");
+ if (access_mask & ACE_APPEND_DATA)
+ mdb_printf("p");
+ else
+ mdb_printf("-");
+ if (access_mask & ACE_DELETE)
+ mdb_printf("d");
+ else
+ mdb_printf("-");
+ if (access_mask & ACE_DELETE_CHILD)
+ mdb_printf("D");
+ else
+ mdb_printf("-");
+ if (access_mask & ACE_READ_ATTRIBUTES)
+ mdb_printf("a");
+ else
+ mdb_printf("-");
+ if (access_mask & ACE_WRITE_ATTRIBUTES)
+ mdb_printf("A");
+ else
+ mdb_printf("-");
+ if (access_mask & ACE_READ_NAMED_ATTRS)
+ mdb_printf("R");
+ else
+ mdb_printf("-");
+ if (access_mask & ACE_WRITE_NAMED_ATTRS)
+ mdb_printf("W");
+ else
+ mdb_printf("-");
+ if (access_mask & ACE_READ_ACL)
+ mdb_printf("c");
+ else
+ mdb_printf("-");
+ if (access_mask & ACE_WRITE_ACL)
+ mdb_printf("C");
+ else
+ mdb_printf("-");
+ if (access_mask & ACE_WRITE_OWNER)
+ mdb_printf("o");
+ else
+ mdb_printf("-");
+ if (access_mask & ACE_SYNCHRONIZE)
+ mdb_printf("s");
+ else
+ mdb_printf("-");
+
+ mdb_printf(":");
+
+ /* Print out inheritance flags */
+ if (ace_flags & ACE_FILE_INHERIT_ACE)
+ mdb_printf("f");
+ else
+ mdb_printf("-");
+ if (ace_flags & ACE_DIRECTORY_INHERIT_ACE)
+ mdb_printf("d");
+ else
+ mdb_printf("-");
+ if (ace_flags & ACE_INHERIT_ONLY_ACE)
+ mdb_printf("i");
+ else
+ mdb_printf("-");
+ if (ace_flags & ACE_NO_PROPAGATE_INHERIT_ACE)
+ mdb_printf("n");
+ else
+ mdb_printf("-");
+ if (ace_flags & ACE_SUCCESSFUL_ACCESS_ACE_FLAG)
+ mdb_printf("S");
+ else
+ mdb_printf("-");
+ if (ace_flags & ACE_FAILED_ACCESS_ACE_FLAG)
+ mdb_printf("F");
+ else
+ mdb_printf("-");
+ if (ace_flags & ACE_INHERITED_ACE)
+ mdb_printf("I");
+ else
+ mdb_printf("-");
+
+ switch (ace_type) {
+ case ACE_ACCESS_ALLOWED_ACE_TYPE:
+ mdb_printf(":allow\n");
+ break;
+ case ACE_ACCESS_DENIED_ACE_TYPE:
+ mdb_printf(":deny\n");
+ break;
+ case ACE_SYSTEM_AUDIT_ACE_TYPE:
+ mdb_printf(":audit\n");
+ break;
+ case ACE_SYSTEM_ALARM_ACE_TYPE:
+ mdb_printf(":alarm\n");
+ break;
+ default:
+ mdb_printf(":?\n");
+ }
+ return (DCMD_OK);
+}
+
+/* ARGSUSED */
+static int
+zfs_ace_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ zfs_ace_t zace;
+ int verbose = FALSE;
+ uint64_t id;
+
+ if (!(flags & DCMD_ADDRSPEC))
+ return (DCMD_USAGE);
+
+ if (mdb_getopts(argc, argv,
+ 'v', MDB_OPT_SETBITS, TRUE, &verbose, TRUE, NULL) != argc)
+ return (DCMD_USAGE);
+
+ if (mdb_vread(&zace, sizeof (zfs_ace_t), addr) == -1) {
+ mdb_warn("failed to read zfs_ace_t");
+ return (DCMD_ERR);
+ }
+
+ if ((zace.z_hdr.z_flags & ACE_TYPE_FLAGS) == 0 ||
+ (zace.z_hdr.z_flags & ACE_TYPE_FLAGS) == ACE_IDENTIFIER_GROUP)
+ id = zace.z_fuid;
+ else
+ id = -1;
+
+ return (zfs_ace_print_common(addr, flags, id, zace.z_hdr.z_access_mask,
+ zace.z_hdr.z_flags, zace.z_hdr.z_type, verbose));
+}
+
+/* ARGSUSED */
+static int
+zfs_ace0_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ ace_t ace;
+ uint64_t id;
+ int verbose = FALSE;
+
+ if (!(flags & DCMD_ADDRSPEC))
+ return (DCMD_USAGE);
+
+ if (mdb_getopts(argc, argv,
+ 'v', MDB_OPT_SETBITS, TRUE, &verbose, TRUE, NULL) != argc)
+ return (DCMD_USAGE);
+
+ if (mdb_vread(&ace, sizeof (ace_t), addr) == -1) {
+ mdb_warn("failed to read ace_t");
+ return (DCMD_ERR);
+ }
+
+ if ((ace.a_flags & ACE_TYPE_FLAGS) == 0 ||
+ (ace.a_flags & ACE_TYPE_FLAGS) == ACE_IDENTIFIER_GROUP)
+ id = ace.a_who;
+ else
+ id = -1;
+
+ return (zfs_ace_print_common(addr, flags, id, ace.a_access_mask,
+ ace.a_flags, ace.a_type, verbose));
+}
+
+typedef struct acl_dump_args {
+ int a_argc;
+ const mdb_arg_t *a_argv;
+ uint16_t a_version;
+ int a_flags;
+} acl_dump_args_t;
+
+/* ARGSUSED */
+static int
+acl_aces_cb(uintptr_t addr, const void *unknown, void *arg)
+{
+ acl_dump_args_t *acl_args = (acl_dump_args_t *)arg;
+
+ if (acl_args->a_version == 1) {
+ if (mdb_call_dcmd("zfs_ace", addr,
+ DCMD_ADDRSPEC|acl_args->a_flags, acl_args->a_argc,
+ acl_args->a_argv) != DCMD_OK) {
+ return (WALK_ERR);
+ }
+ } else {
+ if (mdb_call_dcmd("zfs_ace0", addr,
+ DCMD_ADDRSPEC|acl_args->a_flags, acl_args->a_argc,
+ acl_args->a_argv) != DCMD_OK) {
+ return (WALK_ERR);
+ }
+ }
+ acl_args->a_flags = DCMD_LOOP;
+ return (WALK_NEXT);
+}
+
+/* ARGSUSED */
+static int
+acl_cb(uintptr_t addr, const void *unknown, void *arg)
+{
+ acl_dump_args_t *acl_args = (acl_dump_args_t *)arg;
+
+ if (acl_args->a_version == 1) {
+ if (mdb_pwalk("zfs_acl_node_aces", acl_aces_cb,
+ arg, addr) != 0) {
+ mdb_warn("can't walk ACEs");
+ return (DCMD_ERR);
+ }
+ } else {
+ if (mdb_pwalk("zfs_acl_node_aces0", acl_aces_cb,
+ arg, addr) != 0) {
+ mdb_warn("can't walk ACEs");
+ return (DCMD_ERR);
+ }
+ }
+ return (WALK_NEXT);
+}
+
+/* ARGSUSED */
+static int
+zfs_acl_dump(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ zfs_acl_t zacl;
+ int verbose = FALSE;
+ acl_dump_args_t acl_args;
+
+ if (!(flags & DCMD_ADDRSPEC))
+ return (DCMD_USAGE);
+
+ if (mdb_getopts(argc, argv,
+ 'v', MDB_OPT_SETBITS, TRUE, &verbose, TRUE, NULL) != argc)
+ return (DCMD_USAGE);
+
+ if (mdb_vread(&zacl, sizeof (zfs_acl_t), addr) == -1) {
+ mdb_warn("failed to read zfs_acl_t");
+ return (DCMD_ERR);
+ }
+
+ acl_args.a_argc = argc;
+ acl_args.a_argv = argv;
+ acl_args.a_version = zacl.z_version;
+ acl_args.a_flags = DCMD_LOOPFIRST;
+
+ if (mdb_pwalk("zfs_acl_node", acl_cb, &acl_args, addr) != 0) {
+ mdb_warn("can't walk ACL");
+ return (DCMD_ERR);
+ }
+
+ return (DCMD_OK);
+}
+
+/* ARGSUSED */
+static int
+zfs_acl_node_walk_init(mdb_walk_state_t *wsp)
+{
+ if (wsp->walk_addr == NULL) {
+ mdb_warn("must supply address of zfs_acl_node_t\n");
+ return (WALK_ERR);
+ }
+
+ wsp->walk_addr += OFFSETOF(zfs_acl_t, z_acl);
+
+ if (mdb_layered_walk("list", wsp) == -1) {
+ mdb_warn("failed to walk 'list'\n");
+ return (WALK_ERR);
+ }
+
+ return (WALK_NEXT);
+}
+
+static int
+zfs_acl_node_walk_step(mdb_walk_state_t *wsp)
+{
+ zfs_acl_node_t aclnode;
+
+ if (mdb_vread(&aclnode, sizeof (zfs_acl_node_t),
+ wsp->walk_addr) == -1) {
+ mdb_warn("failed to read zfs_acl_node at %p", wsp->walk_addr);
+ return (WALK_ERR);
+ }
+
+ return (wsp->walk_callback(wsp->walk_addr, &aclnode, wsp->walk_cbdata));
+}
+
+typedef struct ace_walk_data {
+ int ace_count;
+ int ace_version;
+} ace_walk_data_t;
+
+static int
+zfs_aces_walk_init_common(mdb_walk_state_t *wsp, int version,
+ int ace_count, uintptr_t ace_data)
+{
+ ace_walk_data_t *ace_walk_data;
+
+ if (wsp->walk_addr == NULL) {
+ mdb_warn("must supply address of zfs_acl_node_t\n");
+ return (WALK_ERR);
+ }
+
+ ace_walk_data = mdb_alloc(sizeof (ace_walk_data_t), UM_SLEEP | UM_GC);
+
+ ace_walk_data->ace_count = ace_count;
+ ace_walk_data->ace_version = version;
+
+ wsp->walk_addr = ace_data;
+ wsp->walk_data = ace_walk_data;
+
+ return (WALK_NEXT);
+}
+
+static int
+zfs_acl_node_aces_walk_init_common(mdb_walk_state_t *wsp, int version)
+{
+ static int gotid;
+ static mdb_ctf_id_t acl_id;
+ int z_ace_count;
+ uintptr_t z_acldata;
+
+ if (!gotid) {
+ if (mdb_ctf_lookup_by_name("struct zfs_acl_node",
+ &acl_id) == -1) {
+ mdb_warn("couldn't find struct zfs_acl_node");
+ return (DCMD_ERR);
+ }
+ gotid = TRUE;
+ }
+
+ if (GETMEMBID(wsp->walk_addr, &acl_id, z_ace_count, z_ace_count)) {
+ return (DCMD_ERR);
+ }
+ if (GETMEMBID(wsp->walk_addr, &acl_id, z_acldata, z_acldata)) {
+ return (DCMD_ERR);
+ }
+
+ return (zfs_aces_walk_init_common(wsp, version,
+ z_ace_count, z_acldata));
+}
+
+/* ARGSUSED */
+static int
+zfs_acl_node_aces_walk_init(mdb_walk_state_t *wsp)
+{
+ return (zfs_acl_node_aces_walk_init_common(wsp, 1));
+}
+
+/* ARGSUSED */
+static int
+zfs_acl_node_aces0_walk_init(mdb_walk_state_t *wsp)
+{
+ return (zfs_acl_node_aces_walk_init_common(wsp, 0));
+}
+
+static int
+zfs_aces_walk_step(mdb_walk_state_t *wsp)
+{
+ ace_walk_data_t *ace_data = wsp->walk_data;
+ zfs_ace_t zace;
+ ace_t *acep;
+ int status;
+ int entry_type;
+ int allow_type;
+ uintptr_t ptr;
+
+ if (ace_data->ace_count == 0)
+ return (WALK_DONE);
+
+ if (mdb_vread(&zace, sizeof (zfs_ace_t), wsp->walk_addr) == -1) {
+ mdb_warn("failed to read zfs_ace_t at %#lx",
+ wsp->walk_addr);
+ return (WALK_ERR);
+ }
+
+ switch (ace_data->ace_version) {
+ case 0:
+ acep = (ace_t *)&zace;
+ entry_type = acep->a_flags & ACE_TYPE_FLAGS;
+ allow_type = acep->a_type;
+ break;
+ case 1:
+ entry_type = zace.z_hdr.z_flags & ACE_TYPE_FLAGS;
+ allow_type = zace.z_hdr.z_type;
+ break;
+ default:
+ return (WALK_ERR);
+ }
+
+ ptr = (uintptr_t)wsp->walk_addr;
+ switch (entry_type) {
+ case ACE_OWNER:
+ case ACE_EVERYONE:
+ case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+ ptr += ace_data->ace_version == 0 ?
+ sizeof (ace_t) : sizeof (zfs_ace_hdr_t);
+ break;
+ case ACE_IDENTIFIER_GROUP:
+ default:
+ switch (allow_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ ptr += ace_data->ace_version == 0 ?
+ sizeof (ace_t) : sizeof (zfs_object_ace_t);
+ break;
+ default:
+ ptr += ace_data->ace_version == 0 ?
+ sizeof (ace_t) : sizeof (zfs_ace_t);
+ break;
+ }
+ }
+
+ ace_data->ace_count--;
+ status = wsp->walk_callback(wsp->walk_addr,
+ (void *)(uintptr_t)&zace, wsp->walk_cbdata);
+
+ wsp->walk_addr = ptr;
+ return (status);
+}
+
/*
* MDB module linkage information:
*
@@ -2304,7 +2902,7 @@
{ "dbuf", ":", "print dmu_buf_impl_t", dbuf },
{ "dbuf_stats", ":", "dbuf stats", dbuf_stats },
{ "dbufs",
- "\t[-O objset_t*] [-n objset_name | \"mos\"] "
+ "\t[-O objset_impl_t*] [-n objset_name | \"mos\"] "
"[-o object | \"mdn\"] \n"
"\t[-l level] [-b blkid | \"bonus\"]",
"find dmu_buf_impl_t's that match specified criteria", dbufs },
@@ -2333,6 +2931,14 @@
{ "zfs_params", "", "print zfs tunable parameters", zfs_params },
{ "refcount", "", "print refcount_t holders", refcount },
{ "zap_leaf", "", "print zap_leaf_phys_t", zap_leaf },
+ { "zfs_aces", ":[-v]", "print all ACEs from a zfs_acl_t",
+ zfs_acl_dump },
+ { "zfs_ace", ":[-v]", "print zfs_ace", zfs_ace_print },
+ { "zfs_ace0", ":[-v]", "print zfs_ace0", zfs_ace0_print },
+ { "sa_attr_table", ":", "print SA attribute table from sa_os_t",
+ sa_attr_table},
+ { "sa_attr", ": attr_id",
+ "print SA attribute address when given sa_handle_t", sa_attr_print},
{ NULL }
};
@@ -2366,6 +2972,13 @@
spa_walk_init, spa_walk_step, NULL },
{ "metaslab", "given a spa_t *, walk all metaslab_t structures",
metaslab_walk_init, metaslab_walk_step, NULL },
+ { "zfs_acl_node", "given a zfs_acl_t, walk all zfs_acl_nodes",
+ zfs_acl_node_walk_init, zfs_acl_node_walk_step, NULL },
+ { "zfs_acl_node_aces", "given a zfs_acl_node_t, walk all ACEs",
+ zfs_acl_node_aces_walk_init, zfs_aces_walk_step, NULL },
+ { "zfs_acl_node_aces0",
+ "given a zfs_acl_node_t, walk all ACEs as ace_t",
+ zfs_acl_node_aces0_walk_init, zfs_aces_walk_step, NULL },
{ NULL }
};
--- a/usr/src/cmd/zdb/zdb.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/cmd/zdb/zdb.c Tue Mar 16 09:43:38 2010 -0600
@@ -34,6 +34,9 @@
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
@@ -370,6 +373,71 @@
/*ARGSUSED*/
static void
+dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+
+ dump_zap_stats(os, object);
+ (void) printf("\n");
+
+ for (zap_cursor_init(&zc, os, object);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ zap_cursor_advance(&zc)) {
+ (void) printf("\t\t%s = ", attr.za_name);
+ if (attr.za_num_integers == 0) {
+ (void) printf("\n");
+ continue;
+ }
+ (void) printf(" %llx : [%d:%d:%d]\n",
+ (u_longlong_t)attr.za_first_integer,
+ (int)ATTR_LENGTH(attr.za_first_integer),
+ (int)ATTR_BSWAP(attr.za_first_integer),
+ (int)ATTR_NUM(attr.za_first_integer));
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*ARGSUSED*/
+static void
+dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ uint16_t *layout_attrs;
+ int i;
+
+ dump_zap_stats(os, object);
+ (void) printf("\n");
+
+ for (zap_cursor_init(&zc, os, object);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ zap_cursor_advance(&zc)) {
+ (void) printf("\t\t%s = [", attr.za_name);
+ if (attr.za_num_integers == 0) {
+ (void) printf("\n");
+ continue;
+ }
+
+ VERIFY(attr.za_integer_length == 2);
+ layout_attrs = umem_zalloc(attr.za_num_integers *
+ attr.za_integer_length, UMEM_NOFAIL);
+
+ VERIFY(zap_lookup(os, object, attr.za_name,
+ attr.za_integer_length,
+ attr.za_num_integers, layout_attrs) == 0);
+
+ for (i = 0; i != attr.za_num_integers; i++)
+ (void) printf(" %d ", (int)layout_attrs[i]);
+ (void) printf("]\n");
+ umem_free(layout_attrs,
+ attr.za_num_integers * attr.za_integer_length);
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*ARGSUSED*/
+static void
dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
{
zap_cursor_t zc;
@@ -1106,6 +1174,8 @@
static avl_tree_t idx_tree;
static avl_tree_t domain_tree;
static boolean_t fuid_table_loaded;
+static boolean_t sa_loaded;
+sa_attr_type_t *sa_attr_table;
static void
fuid_table_destroy()
@@ -1138,12 +1208,12 @@
}
static void
-dump_uidgid(objset_t *os, znode_phys_t *zp)
+dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
{
uint32_t uid_idx, gid_idx;
- uid_idx = FUID_INDEX(zp->zp_uid);
- gid_idx = FUID_INDEX(zp->zp_gid);
+ uid_idx = FUID_INDEX(uid);
+ gid_idx = FUID_INDEX(gid);
/* Load domain table, if not already loaded */
if (!fuid_table_loaded && (uid_idx || gid_idx)) {
@@ -1158,50 +1228,103 @@
fuid_table_loaded = B_TRUE;
}
- print_idstr(zp->zp_uid, "uid");
- print_idstr(zp->zp_gid, "gid");
+ print_idstr(uid, "uid");
+ print_idstr(gid, "gid");
}
/*ARGSUSED*/
static void
dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
{
- znode_phys_t *zp = data;
+ char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */
+ sa_handle_t *hdl;
+ uint64_t xattr, rdev, gen;
+ uint64_t uid, gid, mode, fsize, parent, links;
+ uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
time_t z_crtime, z_atime, z_mtime, z_ctime;
- char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */
+ sa_bulk_attr_t bulk[11];
+ int idx = 0;
int error;
- ASSERT(size >= sizeof (znode_phys_t));
+ if (!sa_loaded) {
+ uint64_t sa_attrs = 0;
+ uint64_t version;
+
+ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ 8, 1, &version) == 0);
+ if (version >= ZPL_VERSION_SA) {
+ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
+ 8, 1, &sa_attrs) == 0);
+ }
+ sa_attr_table = sa_setup(os, sa_attrs,
+ zfs_attr_table, ZPL_END);
+ sa_loaded = B_TRUE;
+ }
+
+ if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
+ (void) printf("Failed to get handle for SA znode\n");
+ return;
+ }
+
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
+ &links, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
+ &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
+ &fsize, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
+ acctm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
+ modtm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
+ crtm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
+ chgtm, 16);
+
+ if (sa_bulk_lookup(hdl, bulk, idx)) {
+ (void) sa_handle_destroy(hdl);
+ return;
+ }
error = zfs_obj_to_path(os, object, path, sizeof (path));
if (error != 0) {
(void) snprintf(path, sizeof (path), "\?\?\?<object#%llu>",
(u_longlong_t)object);
}
-
if (dump_opt['d'] < 3) {
(void) printf("\t%s\n", path);
+ (void) sa_handle_destroy(hdl);
return;
}
- z_crtime = (time_t)zp->zp_crtime[0];
- z_atime = (time_t)zp->zp_atime[0];
- z_mtime = (time_t)zp->zp_mtime[0];
- z_ctime = (time_t)zp->zp_ctime[0];
+ z_crtime = (time_t)crtm[0];
+ z_atime = (time_t)acctm[0];
+ z_mtime = (time_t)modtm[0];
+ z_ctime = (time_t)chgtm[0];
(void) printf("\tpath %s\n", path);
- dump_uidgid(os, zp);
+ dump_uidgid(os, uid, gid);
(void) printf("\tatime %s", ctime(&z_atime));
(void) printf("\tmtime %s", ctime(&z_mtime));
(void) printf("\tctime %s", ctime(&z_ctime));
(void) printf("\tcrtime %s", ctime(&z_crtime));
- (void) printf("\tgen %llu\n", (u_longlong_t)zp->zp_gen);
- (void) printf("\tmode %llo\n", (u_longlong_t)zp->zp_mode);
- (void) printf("\tsize %llu\n", (u_longlong_t)zp->zp_size);
- (void) printf("\tparent %llu\n", (u_longlong_t)zp->zp_parent);
- (void) printf("\tlinks %llu\n", (u_longlong_t)zp->zp_links);
- (void) printf("\txattr %llu\n", (u_longlong_t)zp->zp_xattr);
- (void) printf("\trdev 0x%016llx\n", (u_longlong_t)zp->zp_rdev);
+ (void) printf("\tgen %llu\n", (u_longlong_t)gen);
+ (void) printf("\tmode %llo\n", (u_longlong_t)mode);
+ (void) printf("\tsize %llu\n", (u_longlong_t)fsize);
+ (void) printf("\tparent %llu\n", (u_longlong_t)parent);
+ (void) printf("\tlinks %llu\n", (u_longlong_t)links);
+ if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
+ sizeof (uint64_t)) == 0)
+ (void) printf("\txattr %llu\n", (u_longlong_t)xattr);
+ if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
+ sizeof (uint64_t)) == 0)
+ (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);
+ sa_handle_destroy(hdl);
}
/*ARGSUSED*/
@@ -1261,7 +1384,11 @@
dump_zap, /* snapshot refcount tags */
dump_ddt_zap, /* DDT ZAP object */
dump_zap, /* DDT statistics */
- dump_unknown /* Unknown type, must be last */
+ dump_znode, /* SA object */
+ dump_zap, /* SA Master Node */
+ dump_sa_attrs, /* SA attribute registration */
+ dump_sa_layouts, /* SA attribute layouts */
+ dump_unknown, /* Unknown type, must be last */
};
static void
@@ -1328,11 +1455,13 @@
}
if (verbosity >= 4) {
- (void) printf("\tdnode flags: %s%s\n",
+ (void) printf("\tdnode flags: %s%s%s\n",
(dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
"USED_BYTES " : "",
(dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
- "USERUSED_ACCOUNTED " : "");
+ "USERUSED_ACCOUNTED " : "",
+ (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
+ "SPILL_BLKPTR" : "");
(void) printf("\tdnode maxblkid: %llu\n",
(longlong_t)dn->dn_phys->dn_maxblkid);
@@ -1685,6 +1814,7 @@
dump_dir(os);
dmu_objset_disown(os, FTAG);
fuid_table_destroy();
+ sa_loaded = B_FALSE;
return (0);
}
@@ -2961,6 +3091,7 @@
(os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
fuid_table_destroy();
+ sa_loaded = B_FALSE;
libzfs_fini(g_zfs);
kernel_fini();
--- a/usr/src/cmd/zfs/Makefile Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/cmd/zfs/Makefile Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2010 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -39,10 +39,12 @@
LDLIBS += -lzfs -luutil -lumem -lnvpair
+INCS += -I../../common/zfs
+
C99MODE= -xc99=%all
C99LMODE= -Xc99=%all
-CPPFLAGS += -D_LARGEFILE64_SOURCE=1 -D_REENTRANT
+CPPFLAGS += -D_LARGEFILE64_SOURCE=1 -D_REENTRANT $(INCS)
$(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG
# lint complains about unused _umem_* functions
--- a/usr/src/cmd/zfs/zfs_main.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/cmd/zfs/zfs_main.c Tue Mar 16 09:43:38 2010 -0600
@@ -53,6 +53,7 @@
#include "zfs_iter.h"
#include "zfs_util.h"
+#include "zfs_comutil.h"
libzfs_handle_t *g_zfs;
@@ -1594,31 +1595,25 @@
{
upgrade_cbdata_t *cb = data;
int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
- int i;
- static struct { int zplver; int spaver; } table[] = {
- {ZPL_VERSION_FUID, SPA_VERSION_FUID},
- {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
- {0, 0}
- };
-
-
- for (i = 0; table[i].zplver; i++) {
- if (cb->cb_version >= table[i].zplver) {
- int spa_version;
-
- if (zfs_spa_version(zhp, &spa_version) < 0)
- return (-1);
-
- if (spa_version < table[i].spaver) {
- /* can't upgrade */
- (void) printf(gettext("%s: can not be "
- "upgraded; the pool version needs to first "
- "be upgraded\nto version %d\n\n"),
- zfs_get_name(zhp), table[i].spaver);
- cb->cb_numfailed++;
- return (0);
- }
- }
+ int needed_spa_version;
+ int spa_version;
+
+ if (zfs_spa_version(zhp, &spa_version) < 0)
+ return (-1);
+
+ needed_spa_version = zfs_spa_version_map(cb->cb_version);
+
+ if (needed_spa_version < 0)
+ return (-1);
+
+ if (spa_version < needed_spa_version) {
+ /* can't upgrade */
+ (void) printf(gettext("%s: can not be "
+ "upgraded; the pool version needs to first "
+ "be upgraded\nto version %d\n\n"),
+ zfs_get_name(zhp), needed_spa_version);
+ cb->cb_numfailed++;
+ return (0);
}
/* upgrade */
@@ -1720,6 +1715,7 @@
"unique identifier (FUID)\n"));
(void) printf(gettext(" 4 userquota, groupquota "
"properties\n"));
+ (void) printf(gettext(" 5 System attributes\n"));
(void) printf(gettext("\nFor more information on a particular "
"version, including supported releases, see:\n\n"));
(void) printf("http://www.opensolaris.org/os/community/zfs/"
--- a/usr/src/cmd/zpool/zpool_main.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/cmd/zpool/zpool_main.c Tue Mar 16 09:43:38 2010 -0600
@@ -3889,6 +3889,7 @@
(void) printf(gettext(" 21 Deduplication\n"));
(void) printf(gettext(" 22 Received properties\n"));
(void) printf(gettext(" 23 Slim ZIL\n"));
+ (void) printf(gettext(" 24 System attributes\n"));
(void) printf(gettext("\nFor more information on a particular "
"version, including supported releases, see:\n\n"));
(void) printf("http://www.opensolaris.org/os/community/zfs/"
--- a/usr/src/cmd/zstreamdump/zstreamdump.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/cmd/zstreamdump/zstreamdump.c Tue Mar 16 09:43:38 2010 -0600
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -88,6 +88,7 @@
struct drr_write *drrw = &thedrr.drr_u.drr_write;
struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref;
struct drr_free *drrf = &thedrr.drr_u.drr_free;
+ struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
char c;
boolean_t verbose = B_FALSE;
boolean_t first = B_TRUE;
@@ -378,6 +379,18 @@
(longlong_t)drrf->drr_length);
}
break;
+ case DRR_SPILL:
+ if (do_byteswap) {
+ drrs->drr_object = BSWAP_64(drrs->drr_object);
+ drrs->drr_length = BSWAP_64(drrs->drr_length);
+ }
+ if (verbose) {
+ (void) printf("SPILL block for object = %llu "
+ "length = %llu\n", drrs->drr_object,
+ drrs->drr_length);
+ }
+ (void) ssread(buf, drrs->drr_length, &zc);
+ break;
}
pcksum = zc;
}
@@ -398,12 +411,15 @@
(u_longlong_t)drr_record_count[DRR_WRITE]);
(void) printf("\tTotal DRR_FREE records = %lld\n",
(u_longlong_t)drr_record_count[DRR_FREE]);
+ (void) printf("\tTotal DRR_SPILL records = %lld\n",
+ (u_longlong_t)drr_record_count[DRR_SPILL]);
(void) printf("\tTotal records = %lld\n",
(u_longlong_t)(drr_record_count[DRR_BEGIN] +
drr_record_count[DRR_OBJECT] +
drr_record_count[DRR_FREEOBJECTS] +
drr_record_count[DRR_WRITE] +
drr_record_count[DRR_FREE] +
+ drr_record_count[DRR_SPILL] +
drr_record_count[DRR_END]));
(void) printf("\tTotal write size = %lld (0x%llx)\n",
(u_longlong_t)total_write_size, (u_longlong_t)total_write_size);
--- a/usr/src/common/zfs/zfs_comutil.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/common/zfs/zfs_comutil.c Tue Mar 16 09:43:38 2010 -0600
@@ -39,6 +39,7 @@
#include <sys/fs/zfs.h>
#include <sys/int_limits.h>
#include <sys/nvpair.h>
+#include "zfs_comutil.h"
/*
* Are there allocatable vdevs?
@@ -103,3 +104,56 @@
if (zrpp->zrp_request == 0)
zrpp->zrp_request = ZPOOL_NO_REWIND;
}
+
+typedef struct zfs_version_spa_map {
+ int version_zpl;
+ int version_spa;
+} zfs_version_spa_map_t;
+
+/*
+ * Keep this table in monotonically increasing version number order.
+ */
+static zfs_version_spa_map_t zfs_version_table[] = {
+ {ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL},
+ {ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL},
+ {ZPL_VERSION_FUID, SPA_VERSION_FUID},
+ {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
+ {ZPL_VERSION_SA, SPA_VERSION_SA},
+ {0, 0}
+};
+
+/*
+ * Return the max zpl version for a corresponding spa version
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_zpl_version_map(int spa_version)
+{
+ int i;
+ int version = -1;
+
+ for (i = 0; zfs_version_table[i].version_spa; i++) {
+ if (spa_version >= zfs_version_table[i].version_spa)
+ version = zfs_version_table[i].version_zpl;
+ }
+
+ return (version);
+}
+
+/*
+ * Return the min spa version for a corresponding spa version
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_spa_version_map(int zpl_version)
+{
+ int i;
+ int version = -1;
+
+ for (i = 0; zfs_version_table[i].version_zpl; i++) {
+ if (zfs_version_table[i].version_zpl >= zpl_version)
+ return (zfs_version_table[i].version_spa);
+ }
+
+ return (version);
+}
--- a/usr/src/common/zfs/zfs_comutil.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/common/zfs/zfs_comutil.h Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -36,6 +36,9 @@
extern boolean_t zfs_allocatable_devs(nvlist_t *);
extern void zpool_get_rewind_policy(nvlist_t *, zpool_rewind_policy_t *);
+extern int zfs_zpl_version_map(int spa_version);
+extern int zfs_spa_version_map(int zpl_version);
+
#ifdef __cplusplus
}
#endif
--- a/usr/src/common/zfs/zfs_prop.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/common/zfs/zfs_prop.c Tue Mar 16 09:43:38 2010 -0600
@@ -153,6 +153,7 @@
{ "2", 2 },
{ "3", 3 },
{ "4", 4 },
+ { "5", 5 },
{ "current", ZPL_VERSION },
{ NULL }
};
--- a/usr/src/grub/capability Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/grub/capability Tue Mar 16 09:43:38 2010 -0600
@@ -40,7 +40,7 @@
# This file and the associated version are Solaris specific and are
# not a part of the open source distribution of GRUB.
#
-VERSION=15
+VERSION=16
dboot
xVM
zfs
--- a/usr/src/grub/grub-0.97/stage2/fsys_zfs.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/grub/grub-0.97/stage2/fsys_zfs.c Tue Mar 16 09:43:38 2010 -0600
@@ -670,6 +670,7 @@
zapbuf = stack;
size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
stack += size;
+
if (errnum = dmu_read(zap_dnode, 0, zapbuf, stack))
return (errnum);
@@ -1425,7 +1426,44 @@
}
/* get the file size and set the file position to 0 */
- filemax = ((znode_phys_t *)DN_BONUS(DNODE))->zp_size;
+
+ /*
+ * For DMU_OT_SA we will need to locate the SIZE attribute
+ * attribute, which could be either in the bonus buffer
+ * or the "spill" block.
+ */
+ if (DNODE->dn_bonustype == DMU_OT_SA) {
+ sa_hdr_phys_t *sahdrp;
+ int hdrsize;
+
+ sahdrp = (sa_hdr_phys_t *)DN_BONUS(DNODE);
+ if (DNODE->dn_bonuslen != 0) {
+ sahdrp = (sa_hdr_phys_t *)DN_BONUS(DNODE);
+ } else {
+ if (DNODE->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ blkptr_t *bp = &DNODE->dn_spill;
+ void *buf;
+
+ buf = (void *)stack;
+ stack += BP_GET_LSIZE(bp);
+
+ /* reset errnum to rawread() failure */
+ errnum = 0;
+ if (zio_read(bp, buf, stack) != 0) {
+ return (0);
+ }
+ sahdrp = buf;
+ } else {
+ errnum = ERR_FSYS_CORRUPT;
+ return (0);
+ }
+ }
+ hdrsize = SA_HDR_SIZE(sahdrp);
+ filemax = *(uint64_t *)((char *)sahdrp + hdrsize +
+ SA_SIZE_OFFSET);
+ } else {
+ filemax = ((znode_phys_t *)DN_BONUS(DNODE))->zp_size;
+ }
filepos = 0;
dnode_buf = NULL;
--- a/usr/src/grub/grub-0.97/stage2/fsys_zfs.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/grub/grub-0.97/stage2/fsys_zfs.h Tue Mar 16 09:43:38 2010 -0600
@@ -53,6 +53,7 @@
#include <zfs-include/dsl_dataset.h>
#include <zfs-include/zil.h>
#include <zfs-include/dmu_objset.h>
+#include <zfs-include/sa_impl.h>
/*
* Global Memory addresses to store MOS and DNODE data
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/dmu.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/dmu.h Tue Mar 16 09:43:38 2010 -0600
@@ -17,15 +17,13 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_DMU_H
#define _SYS_DMU_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This file describes the interface that the DMU provides for its
* consumers.
@@ -75,7 +73,22 @@
DMU_OT_SPA_HISTORY, /* UINT8 */
DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
DMU_OT_POOL_PROPS, /* ZAP */
-
+ DMU_OT_DSL_PERMS, /* ZAP */
+ DMU_OT_ACL, /* ACL */
+ DMU_OT_SYSACL, /* SYSACL */
+ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
+ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
+ DMU_OT_NEXT_CLONES, /* ZAP */
+ DMU_OT_SCRUB_QUEUE, /* ZAP */
+ DMU_OT_USERGROUP_USED, /* ZAP */
+ DMU_OT_USERGROUP_QUOTA, /* ZAP */
+ DMU_OT_USERREFS, /* ZAP */
+ DMU_OT_DDT_ZAP, /* ZAP */
+ DMU_OT_DDT_STATS, /* ZAP */
+ DMU_OT_SA, /* System attr */
+ DMU_OT_SA_MASTER_NODE, /* ZAP */
+ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
+ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
DMU_OT_NUMTYPES
} dmu_object_type_t;
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/dnode.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/dnode.h Tue Mar 16 09:43:38 2010 -0600
@@ -17,15 +17,13 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_DNODE_H
#define _SYS_DNODE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Fixed constants.
*/
@@ -49,6 +47,8 @@
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
+
#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
(((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
@@ -72,7 +72,8 @@
uint64_t dn_pad3[4];
blkptr_t dn_blkptr[1];
- uint8_t dn_bonus[DN_MAX_BONUSLEN];
+ uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
+ blkptr_t dn_spill;
} dnode_phys_t;
#endif /* _SYS_DNODE_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/sa_impl.h Tue Mar 16 09:43:38 2010 -0600
@@ -0,0 +1,38 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SA_IMPL_H
+#define _SYS_SA_IMPL_H
+
+typedef struct sa_hdr_phys {
+ uint32_t sa_magic;
+ uint16_t sa_layout_info;
+ uint16_t sa_lengths[1];
+} sa_hdr_phys_t;
+
+#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0)
+#define SA_SIZE_OFFSET 0x8
+
+#endif /* _SYS_SA_IMPL_H */
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h Tue Mar 16 09:43:38 2010 -0600
@@ -27,7 +27,7 @@
/*
* On-disk version number.
*/
-#define SPA_VERSION 23ULL
+#define SPA_VERSION 24ULL
/*
* The following are configuration names used in the nvlist describing a pool's
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs_znode.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs_znode.h Tue Mar 16 09:43:38 2010 -0600
@@ -17,7 +17,7 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -27,8 +27,9 @@
#define MASTER_NODE_OBJ 1
#define ZFS_ROOT_OBJ "ROOT"
#define ZPL_VERSION_STR "VERSION"
+#define ZFS_SA_ATTRS "SA_ATTRS"
-#define ZPL_VERSION 4ULL
+#define ZPL_VERSION 5ULL
#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
--- a/usr/src/lib/libzfs/common/libzfs_impl.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/lib/libzfs/common/libzfs_impl.h Tue Mar 16 09:43:38 2010 -0600
@@ -30,7 +30,6 @@
#include <sys/dmu.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
-#include <sys/zfs_acl.h>
#include <sys/spa.h>
#include <sys/nvpair.h>
--- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c Tue Mar 16 09:43:38 2010 -0600
@@ -203,6 +203,7 @@
struct drr_end *drre = &thedrr.drr_u.drr_end;
struct drr_object *drro = &thedrr.drr_u.drr_object;
struct drr_write *drrw = &thedrr.drr_u.drr_write;
+ struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
FILE *ofp;
int outfd;
dmu_replay_record_t wbr_drr = {0};
@@ -302,6 +303,18 @@
break;
}
+ case DRR_SPILL:
+ {
+ if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+ &stream_cksum, outfd) == -1)
+ goto out;
+ (void) ssread(buf, drrs->drr_length, ofp);
+ if (cksum_and_write(buf, drrs->drr_length,
+ &stream_cksum, outfd) == -1)
+ goto out;
+ break;
+ }
+
case DRR_FREEOBJECTS:
{
if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
@@ -1154,6 +1167,14 @@
dedup_arg_t dda = { 0 };
int featureflags = 0;
+ if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) {
+ uint64_t version;
+ version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+ if (version >= ZPL_VERSION_SA) {
+ featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
+ }
+ }
+
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot send '%s'"), zhp->zfs_name);
@@ -2180,7 +2201,14 @@
(void) recv_read(hdl, fd, buf,
drr->drr_u.drr_write.drr_length, B_FALSE, NULL);
break;
-
+ case DRR_SPILL:
+ if (byteswap) {
+ drr->drr_u.drr_write.drr_length =
+ BSWAP_64(drr->drr_u.drr_spill.drr_length);
+ }
+ (void) recv_read(hdl, fd, buf,
+ drr->drr_u.drr_spill.drr_length, B_FALSE, NULL);
+ break;
case DRR_WRITE_BYREF:
case DRR_FREEOBJECTS:
case DRR_FREE:
--- a/usr/src/lib/libzfs/common/mapfile-vers Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/lib/libzfs/common/mapfile-vers Tue Mar 16 09:43:38 2010 -0600
@@ -134,6 +134,7 @@
zfs_smb_acl_rename;
zfs_snapshot;
zfs_spa_version;
+ zfs_spa_version_map;
zfs_type_to_name;
zfs_unmount;
zfs_unmountall;
@@ -146,6 +147,7 @@
zfs_unshareall_smb;
zfs_userspace;
zfs_userquota_prop_prefixes;
+ zfs_zpl_version_map;
zpool_add;
zpool_clear;
zpool_clear_label;
--- a/usr/src/lib/libzpool/common/kernel.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/lib/libzpool/common/kernel.c Tue Mar 16 09:43:38 2010 -0600
@@ -776,6 +776,17 @@
return (0);
}
+int
+ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
+{
+ char *end;
+
+ *result = strtoull(str, &end, base);
+ if (*result == 0)
+ return (errno);
+ return (0);
+}
+
/*
* =========================================================================
* kernel emulation setup & teardown
--- a/usr/src/lib/libzpool/common/llib-lzpool Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/lib/libzpool/common/llib-lzpool Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -50,6 +50,8 @@
#include <sys/dbuf.h>
#include <sys/zio_checksum.h>
#include <sys/ddt.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h Tue Mar 16 09:43:38 2010 -0600
@@ -536,6 +536,9 @@
extern int ddi_strtoul(const char *str, char **nptr, int base,
unsigned long *result);
+extern int ddi_strtoull(const char *str, char **nptr, int base,
+ u_longlong_t *result);
+
/* ZFS Boot Related stuff. */
struct _buf {
--- a/usr/src/psm/stand/bootblks/zfs/common/zfs.fth Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/psm/stand/bootblks/zfs/common/zfs.fth Tue Mar 16 09:43:38 2010 -0600
@@ -19,13 +19,13 @@
\ CDDL HEADER END
\
\
-\ Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+\ Copyright 2010 Sun Microsystems, Inc. All rights reserved.
\ Use is subject to license terms.
\
purpose: ZFS file system support package
-copyright: Copyright 2009 Sun Microsystems, Inc. All Rights Reserved
+copyright: Copyright 2010 Sun Microsystems, Inc. All Rights Reserved
" /packages" get-package push-package
@@ -395,13 +395,18 @@
\ ZFS dnode (DMU) routines
\
+ d# 44 constant ot-sa#
+
d# 512 constant /dnode
- : dn_indblkshift ( dn -- n ) h# 1 + c@ ;
- : dn_nlevels ( dn -- n ) h# 2 + c@ ;
- : dn_datablkszsec ( dn -- n ) h# 8 + w@ ;
- : dn_blkptr ( dn -- p ) h# 40 + ;
- : dn_bonus ( dn -- p ) h# c0 + ;
+ : dn_indblkshift ( dn -- n ) h# 1 + c@ ;
+ : dn_nlevels ( dn -- n ) h# 2 + c@ ;
+ : dn_bonustype ( dn -- n ) h# 4 + c@ ;
+ : dn_datablkszsec ( dn -- n ) h# 8 + w@ ;
+ : dn_bonuslen ( dn -- n ) h# a + w@ ;
+ : dn_blkptr ( dn -- p ) h# 40 + ;
+ : dn_bonus ( dn -- p ) h# c0 + ;
+ : dn_spill ( dn -- p ) h# 180 + ;
0 instance value dnode
@@ -755,7 +760,6 @@
0 instance value mos-dn
0 instance value obj-dir
0 instance value root-dsl
- 0 instance value root-dsl#
0 instance value fs-dn
\ dn-cache contains dc-dn's contents at dc-blk#
@@ -819,7 +823,6 @@
obj-dir " root_dataset" zap-lookup if
" no root_dataset" die
then ( obj# )
- dup to root-dsl#
get-mos-dnode ( )
dnode root-dsl /dnode move
;
@@ -888,6 +891,20 @@
\
1 constant master-node#
+
+ 0 instance value bootfs-obj#
+ 0 instance value root-obj#
+ 0 instance value current-obj#
+ 0 instance value search-obj#
+
+ instance defer fsize ( dn -- size )
+ instance defer mode ( dn -- mode )
+ instance defer parent ( dn -- obj# )
+ instance defer readlink ( dst dn -- )
+
+ \
+ \ routines when bonus pool contains a znode
+ \
d# 264 constant /znode
d# 56 constant /zn-slink
@@ -895,15 +912,77 @@
: zp_size ( zn -- n ) h# 50 + x@ ;
: zp_parent ( zn -- n ) h# 58 + x@ ;
- 0 instance value bootfs-obj#
- 0 instance value root-obj#
- 0 instance value current-obj#
- 0 instance value search-obj#
-
alias >znode dn_bonus
- : fsize ( dn -- n ) >znode zp_size ;
- : ftype ( dn -- n ) >znode zp_mode h# f000 and ;
+ : zn-fsize ( dn -- n ) >znode zp_size ;
+ : zn-mode ( dn -- n ) >znode zp_mode ;
+ : zn-parent ( dn -- n ) >znode zp_parent ;
+
+ \ copy symlink target to dst
+ : zn-readlink ( dst dn -- )
+ dup zn-fsize tuck /zn-slink > if ( dst size dn )
+ \ contents in 1st block
+ temp-space over dn-bsize ( dst size dn t-adr bsize )
+ rot 0 lblk#>bp read-bp ( dst size )
+ temp-space ( dst size src )
+ else ( dst size dn )
+ \ contents in dnode
+ >znode /znode + ( dst size src )
+ then ( dst size src )
+ -rot move ( )
+ ;
+
+ \
+ \ routines when bonus pool contains sa's
+ \
+
+ \ SA header size when link is in dn_bonus
+ d# 16 constant /sahdr-link
+
+ : sa_props ( sa -- n ) h# 4 + w@ ;
+
+ : sa-hdrsz ( sa -- sz ) sa_props h# 7 >> ;
+
+ alias >sa dn_bonus
+
+ : >sadata ( dn -- adr ) >sa dup sa-hdrsz + ;
+ : sa-mode ( dn -- n ) >sadata x@ ;
+ : sa-fsize ( dn -- n ) >sadata h# 8 + x@ ;
+ : sa-parent ( dn -- n ) >sadata h# 28 + x@ ;
+
+ \ copy symlink target to dst
+ : sa-readlink ( dst dn -- )
+ dup >sa sa-hdrsz /sahdr-link <> if
+ \ contents in 1st attr of dn_spill
+ temp-space over dn_spill ( dst dn t-adr bp )
+ dup bp-lsize swap read-bp ( dst dn )
+ sa-fsize ( dst size )
+ temp-space dup sa-hdrsz + ( dst size src )
+ else ( dst dn )
+ \ content in bonus buf
+ dup dn_bonus over dn_bonuslen + ( dst dn ebonus )
+ swap sa-fsize tuck - ( dst size src )
+ then ( dst size src )
+ -rot move ( )
+ ;
+
+
+ \ setup attr routines for dn
+ : set-attr ( dn -- )
+ dn_bonustype ot-sa# = if
+ ['] sa-fsize to fsize
+ ['] sa-mode to mode
+ ['] sa-parent to parent
+ ['] sa-readlink to readlink
+ else
+ ['] zn-fsize to fsize
+ ['] zn-mode to mode
+ ['] zn-parent to parent
+ ['] zn-readlink to readlink
+ then
+ ;
+
+ : ftype ( dn -- type ) mode h# f000 and ;
: dir? ( dn -- flag ) ftype h# 4000 = ;
: symlink? ( dn -- flag ) ftype h# a000 = ;
@@ -959,7 +1038,7 @@
then
2dup " .." $= if
- 2drop >znode zp_parent ( obj# )
+ 2drop parent ( obj# )
else ( dn file$ )
\ search dir
current-obj# to search-obj#
@@ -967,38 +1046,32 @@
true exit ( not-found )
then ( obj# )
then ( obj# )
- get-fs-dnode false ( found )
+ get-fs-dnode
+ dnode set-attr
+ false ( found )
;
/buf-len instance buffer: fpath-buf
- : clr-fpath-buf ( -- ) fpath-buf /buf-len erase ;
-
- : fpath-buf$ ( -- path$ ) fpath-buf cscount ;
+ /buf-len instance buffer: tpath-buf
- \ copy symlink target to adr
- : readlink ( dst dn -- )
- dup fsize tuck /zn-slink > if ( dst size dn )
- \ contents in 1st block
- temp-space over dn-bsize ( dst size dn t-adr bsize )
- rot 0 lblk#>bp read-bp ( dst size )
- temp-space ( dst size src )
- else ( dst size dn )
- \ contents in dnode
- >znode /znode + ( dst size src )
- then ( dst size src )
- -rot move ( )
- ;
+ : tpath-buf$ ( -- path$ ) tpath-buf cscount ;
+ : fpath-buf$ ( -- path$ ) fpath-buf cscount ;
\ modify tail to account for symlink
: follow-symlink ( tail$ -- tail$' )
- clr-fpath-buf ( tail$ )
- fpath-buf dnode readlink
+ \ read target
+ tpath-buf /buf-len erase
+ tpath-buf dnode readlink
- \ append to current path
+ \ append current path
?dup if ( tail$ )
- " /" fpath-buf$ $append ( tail$ )
- fpath-buf$ $append ( )
+ " /" tpath-buf$ $append ( tail$ )
+ tpath-buf$ $append ( )
else drop then ( )
+
+ \ copy to fpath
+ fpath-buf /buf-len erase
+ tpath-buf$ fpath-buf swap move
fpath-buf$ ( path$ )
\ get directory that starts changed path
@@ -1008,6 +1081,7 @@
search-obj# ( path$ obj# )
then ( path$ obj# )
get-fs-dnode ( path$ )
+ dnode set-attr
;
\ open dnode at path
@@ -1020,6 +1094,7 @@
current-obj# ( path$ obj# )
then ( path$ obj# )
get-fs-dnode ( path$ )
+ dnode set-attr
\ lookup each path component
begin ( path$ )
@@ -1173,7 +1248,7 @@
\ zero instance buffers
file-records /file-records erase
- bootprop-buf /buf-len erase
+ bootprop-buf /buf-len erase
;
: release-buffers ( -- )
--- a/usr/src/uts/common/Makefile.files Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/Makefile.files Tue Mar 16 09:43:38 2010 -0600
@@ -1338,6 +1338,7 @@
lzjb.o \
metaslab.o \
refcount.o \
+ sa.o \
sha256.o \
spa.o \
spa_config.o \
@@ -1363,6 +1364,7 @@
zfs_byteswap.o \
zfs_fm.o \
zfs_fuid.o \
+ zfs_sa.o \
zfs_znode.o \
zil.o \
zio.o \
--- a/usr/src/uts/common/fs/zfs/dbuf.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dbuf.c Tue Mar 16 09:43:38 2010 -0600
@@ -34,6 +34,8 @@
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
@@ -296,13 +298,17 @@
ASSERT3U(db->db.db_object, ==, dn->dn_object);
ASSERT3P(db->db_objset, ==, dn->dn_objset);
ASSERT3U(db->db_level, <, dn->dn_nlevels);
- ASSERT(db->db_blkid == DB_BONUS_BLKID ||
- list_head(&dn->dn_dbufs));
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid ==
+ DMU_SPILL_BLKID || list_head(&dn->dn_dbufs));
}
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
ASSERT(dn != NULL);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
- ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
+ ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
+ } else if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_offset, ==, 0);
} else {
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
}
@@ -336,8 +342,9 @@
ASSERT(db->db_parent == NULL);
else
ASSERT(db->db_parent != NULL);
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
} else {
/* db is pointed to by an indirect block */
int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
@@ -357,7 +364,7 @@
}
}
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
- db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
+ db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
db->db_state != DB_FILL && !dn->dn_free_txg) {
/*
* If the blkptr isn't set but they have nonzero data,
@@ -465,7 +472,7 @@
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT3P(db->db_buf, ==, NULL);
VERIFY(arc_buf_remove_ref(buf, db) == 1);
db->db_state = DB_UNCACHED;
@@ -490,7 +497,7 @@
ASSERT(db->db_state == DB_UNCACHED);
ASSERT(db->db_buf == NULL);
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
ASSERT3U(bonuslen, <=, db->db.db_size);
@@ -570,7 +577,7 @@
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
- prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
(flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
DBUF_IS_CACHEABLE(db);
@@ -630,7 +637,7 @@
dbuf_noread(dmu_buf_impl_t *db)
{
ASSERT(!refcount_is_zero(&db->db_holds));
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
mutex_enter(&db->db_mtx);
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
@@ -675,7 +682,7 @@
if (dr == NULL ||
(dr->dt.dl.dr_data !=
- ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+ ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
return;
/*
@@ -686,7 +693,7 @@
* just null out the current db_data pointer.
*/
ASSERT(dr->dr_txg >= txg - 2);
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */
dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
@@ -713,7 +720,7 @@
ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
ASSERT(db->db_level == 0);
- if (db->db_blkid == DB_BONUS_BLKID ||
+ if (db->db_blkid == DMU_BONUS_BLKID ||
dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
return;
@@ -751,7 +758,7 @@
uint64_t first_l1 = start >> epbs;
uint64_t last_l1 = end >> epbs;
- if (end > dn->dn_maxblkid) {
+ if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
end = dn->dn_maxblkid;
last_l1 = end >> epbs;
}
@@ -759,7 +766,7 @@
mutex_enter(&dn->dn_dbufs_mtx);
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
if (db->db_level == 1 &&
db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
@@ -873,7 +880,7 @@
int osize = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* XXX does *this* func really need the lock? */
ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
@@ -970,6 +977,9 @@
}
mutex_exit(&dn->dn_mtx);
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ dn->dn_have_spill = B_TRUE;
+
/*
* If this buffer is already dirty, we're done.
*/
@@ -979,7 +989,7 @@
while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
drp = &dr->dr_next;
if (dr && dr->dr_txg == tx->tx_txg) {
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
/*
* If this buffer has already been written out,
* we now need to reset its state.
@@ -1020,7 +1030,7 @@
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
- if (db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_blkid != DMU_BONUS_BLKID) {
/*
* Update the accounting.
* Note: we delay "free accounting" until after we drop
@@ -1042,7 +1052,7 @@
void *data_old = db->db_buf;
if (db->db_state != DB_NOFILL) {
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
dbuf_fix_old_data(db, tx->tx_txg);
data_old = db->db.db_data;
} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
@@ -1078,7 +1088,8 @@
* and dbuf_dirty. We win, as though the dbuf_noread() had
* happened after the free.
*/
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_blkid != DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
dnode_clear_range(dn, db->db_blkid, 1, tx);
mutex_exit(&dn->dn_mtx);
@@ -1094,7 +1105,8 @@
mutex_exit(&db->db_mtx);
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
@@ -1182,7 +1194,7 @@
dbuf_dirty_record_t *dr, **drp;
ASSERT(txg != 0);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
mutex_enter(&db->db_mtx);
/*
@@ -1297,7 +1309,7 @@
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(tx->tx_txg != 0);
ASSERT(db->db_level == 0);
ASSERT(!refcount_is_zero(&db->db_holds));
@@ -1319,7 +1331,7 @@
if (db->db_state == DB_FILL) {
if (db->db_level == 0 && db->db_freed_in_flight) {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
bzero(db->db.db_data, db->db.db_size);
@@ -1340,7 +1352,7 @@
{
ASSERT(!refcount_is_zero(&db->db_holds));
ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(db->db_level == 0);
ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
ASSERT(buf != NULL);
@@ -1423,7 +1435,7 @@
if (db->db_state == DB_CACHED) {
ASSERT(db->db.db_data != NULL);
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
}
@@ -1437,7 +1449,7 @@
db->db_state = DB_EVICTING;
db->db_blkptr = NULL;
- if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+ if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
list_remove(&dn->dn_dbufs, db);
dnode_rele(dn, db);
db->db_dnode = NULL;
@@ -1466,7 +1478,19 @@
*parentp = NULL;
*bpp = NULL;
- ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(blkid != DMU_BONUS_BLKID);
+
+ if (blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+ *bpp = &dn->dn_phys->dn_spill;
+ else
+ *bpp = NULL;
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ mutex_exit(&dn->dn_mtx);
+ return (0);
+ }
if (dn->dn_phys->dn_nlevels == 0)
nlevels = 1;
@@ -1539,16 +1563,20 @@
db->db_immediate_evict = 0;
db->db_freed_in_flight = 0;
- if (blkid == DB_BONUS_BLKID) {
+ if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
db->db.db_size = DN_MAX_BONUSLEN -
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
- db->db.db_offset = DB_BONUS_BLKID;
+ db->db.db_offset = DMU_BONUS_BLKID;
db->db_state = DB_UNCACHED;
/* the bonus dbuf is not placed in the hash table */
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
return (db);
+ } else if (blkid == DMU_SPILL_BLKID) {
+ db->db.db_size = (blkptr != NULL) ?
+ BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
+ db->db.db_offset = 0;
} else {
int blocksize =
db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
@@ -1616,7 +1644,7 @@
{
ASSERT(refcount_is_zero(&db->db_holds));
- if (db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_blkid != DMU_BONUS_BLKID) {
/*
* If this dbuf is still on the dn_dbufs list,
* remove it from that list.
@@ -1652,7 +1680,7 @@
dmu_buf_impl_t *db = NULL;
blkptr_t *bp = NULL;
- ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
if (dnode_block_freed(dn, blkid))
@@ -1708,7 +1736,7 @@
{
dmu_buf_impl_t *db, *parent = NULL;
- ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT3U(dn->dn_nlevels, >, level);
@@ -1757,7 +1785,7 @@
* still referencing it from db_data, we need to make a copy
* of it in case we decide we want to dirty it again in this txg.
*/
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
db->db_state == DB_CACHED && db->db_data_pending) {
dbuf_dirty_record_t *dr = db->db_data_pending;
@@ -1812,7 +1840,34 @@
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_bonus == NULL);
- dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
+ dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+}
+
+int
+dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ return (ENOTSUP);
+ if (blksz == 0)
+ blksz = SPA_MINBLOCKSIZE;
+ if (blksz > SPA_MAXBLOCKSIZE)
+ blksz = SPA_MAXBLOCKSIZE;
+ else
+ blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+
+ rw_enter(&db->db_dnode->dn_struct_rwlock, RW_WRITER);
+ dbuf_new_size(db, blksz, tx);
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+
+ return (0);
+}
+
+void
+dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+ dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
+ dnode_rm_spill(dn, tx);
}
#pragma weak dmu_buf_add_ref = dbuf_add_ref
@@ -1858,7 +1913,7 @@
dbuf_evict_user(db);
if (holds == 0) {
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
mutex_exit(&db->db_mtx);
dnode_rele(db->db_dnode, db);
} else if (db->db_buf == NULL) {
@@ -1971,6 +2026,11 @@
if (db->db_blkptr != NULL)
return;
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ db->db_blkptr = &dn->dn_phys->dn_spill;
+ BP_ZERO(db->db_blkptr);
+ return;
+ }
if (db->db_level == dn->dn_phys->dn_nlevels-1) {
/*
* This buffer was allocated at a time when there was
@@ -2071,13 +2131,19 @@
}
DBUF_VERIFY(db);
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
+ mutex_exit(&dn->dn_mtx);
+ }
+
/*
* If this is a bonus buffer, simply copy the bonus data into the
* dnode. It will be written out when the dnode is synced (and it
* will be synced, since it must have been dirty for dbuf_sync to
* be called).
*/
- if (db->db_blkid == DB_BONUS_BLKID) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
dbuf_dirty_record_t **drp;
ASSERT(*datap != NULL);
@@ -2204,14 +2270,27 @@
return;
}
- ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
+ ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_type) ||
+ (db->db_blkid == DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_bonustype));
ASSERT(BP_GET_LEVEL(bp) == db->db_level);
mutex_enter(&db->db_mtx);
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ dnode_t *dn = db->db_dnode;
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+ db->db_blkptr == &dn->dn_phys->dn_spill);
+ }
+#endif
+
if (db->db_level == 0) {
mutex_enter(&dn->dn_mtx);
- if (db->db_blkid > dn->dn_phys->dn_maxblkid)
+ if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+ db->db_blkid != DMU_SPILL_BLKID)
dn->dn_phys->dn_maxblkid = db->db_blkid;
mutex_exit(&dn->dn_mtx);
@@ -2278,8 +2357,17 @@
ASSERT(dr->dr_next == NULL);
*drp = dr->dr_next;
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ dnode_t *dn = db->db_dnode;
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+ db->db_blkptr == &dn->dn_phys->dn_spill);
+ }
+#endif
+
if (db->db_level == 0) {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
if (db->db_state != DB_NOFILL) {
if (dr->dt.dl.dr_data != db->db_buf)
@@ -2362,6 +2450,7 @@
zbookmark_t zb;
zio_prop_t zp;
zio_t *zio;
+ int wp_flag = 0;
if (db->db_state != DB_NOFILL) {
if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
@@ -2385,9 +2474,12 @@
ASSERT(arc_released(parent->db_buf));
zio = parent->db_data_pending->dr_zio;
} else {
- ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
+ db->db_blkid != DMU_SPILL_BLKID) ||
+ (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
zio = dn->dn_zio;
}
@@ -2399,8 +2491,11 @@
os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
db->db.db_object, db->db_level, db->db_blkid);
- dmu_write_policy(os, dn, db->db_level,
- db->db_state == DB_NOFILL ? WP_NOFILL : 0, &zp);
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ wp_flag = WP_SPILL;
+ wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+
+ dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
ASSERT(db->db_state != DB_NOFILL);
--- a/usr/src/uts/common/fs/zfs/dmu.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c Tue Mar 16 09:43:38 2010 -0600
@@ -40,6 +40,7 @@
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
+#include <sys/sa.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <sys/zfs_znode.h>
@@ -90,7 +91,10 @@
{ zap_byteswap, TRUE, "snapshot refcount tags"},
{ zap_byteswap, TRUE, "DDT ZAP algorithm" },
{ zap_byteswap, TRUE, "DDT statistics" },
-};
+ { byteswap_uint8_array, TRUE, "System attributes" },
+ { zap_byteswap, TRUE, "SA master node" },
+ { zap_byteswap, TRUE, "SA attr registration" },
+ { zap_byteswap, TRUE, "SA attr layouts" }, };
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
@@ -142,6 +146,33 @@
return (0);
}
+int
+dmu_set_bonustype(dmu_buf_t *db, dmu_object_type_t type, dmu_tx_t *tx)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+
+ if (type > DMU_OT_NUMTYPES)
+ return (EINVAL);
+
+ if (dn->dn_bonus != (dmu_buf_impl_t *)db)
+ return (EINVAL);
+
+ dnode_setbonus_type(dn, type, tx);
+ return (0);
+}
+
+int
+dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int error;
+
+ error = dnode_hold(os, object, FTAG, &dn);
+ dbuf_rm_spill(dn, tx);
+ dnode_rele(dn, FTAG);
+ return (error);
+}
+
/*
* returns ENOENT, EIO, or 0.
*/
@@ -179,6 +210,61 @@
}
/*
+ * returns ENOENT, EIO, or 0.
+ *
+ * This interface will allocate a blank spill dbuf when a spill blk
+ * doesn't already exist on the dnode.
+ *
+ * if you only want to find an already existing spill db, then
+ * dmu_spill_hold_existing() should be used.
+ */
+int
+dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = NULL;
+ int err;
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&dn->dn_struct_rwlock);
+
+ ASSERT(db != NULL);
+ err = dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | flags);
+ *dbp = &db->db;
+ return (err);
+}
+
+int
+dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)bonus)->db_dnode;
+ int err;
+
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA)
+ return (EINVAL);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (!dn->dn_have_spill) {
+ rw_exit(&dn->dn_struct_rwlock);
+ return (ENOENT);
+ }
+ err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT, tag, dbp);
+ rw_exit(&dn->dn_struct_rwlock);
+ return (err);
+}
+
+int
+dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+ return (dmu_spill_hold_by_dnode(((dmu_buf_impl_t *)bonus)->db_dnode,
+ 0, tag, dbp));
+}
+
+/*
* Note: longer-term, we should modify all of the dmu_buf_*() interfaces
* to take a held dnode rather than <os, object> -- the lookup is wasteful,
* and can induce severe lock contention when writing to several files
@@ -1349,7 +1435,7 @@
zp->zp_checksum = checksum;
zp->zp_compress = compress;
- zp->zp_type = type;
+ zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
@@ -1514,6 +1600,7 @@
arc_init();
l2arc_init();
xuio_stat_init();
+ sa_cache_init();
}
void
@@ -1525,4 +1612,5 @@
dbuf_fini();
l2arc_fini();
xuio_stat_fini();
+ sa_cache_fini();
}
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Tue Mar 16 09:43:38 2010 -0600
@@ -41,6 +41,7 @@
#include <sys/dmu_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/sunddi.h>
+#include <sys/sa.h>
spa_t *
dmu_objset_spa(objset_t *os)
@@ -500,6 +501,9 @@
secondary_cache_changed_cb, os));
}
+ if (os->os_sa)
+ sa_tear_down(os);
+
/*
* We should need only a single pass over the dnode list, since
* nothing can be added to the list at this point.
@@ -1066,20 +1070,11 @@
}
static void
-do_userquota_callback(objset_t *os, dnode_phys_t *dnp,
- boolean_t subtract, dmu_tx_t *tx)
+do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
+ uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
{
- static const char zerobuf[DN_MAX_BONUSLEN] = {0};
- uint64_t user, group;
-
- ASSERT(dnp->dn_type != 0 ||
- (bcmp(DN_BONUS(dnp), zerobuf, DN_MAX_BONUSLEN) == 0 &&
- DN_USED_BYTES(dnp) == 0));
-
- if ((dnp->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) &&
- 0 == used_cbs[os->os_phys->os_type](dnp->dn_bonustype,
- DN_BONUS(dnp), &user, &group)) {
- int64_t delta = DNODE_SIZE + DN_USED_BYTES(dnp);
+ if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
+ int64_t delta = DNODE_SIZE + used;
if (subtract)
delta = -delta;
VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
@@ -1090,7 +1085,7 @@
}
void
-dmu_objset_do_userquota_callbacks(objset_t *os, dmu_tx_t *tx)
+dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
{
dnode_t *dn;
list_t *list = &os->os_synced_dnodes;
@@ -1099,7 +1094,6 @@
while (dn = list_head(list)) {
ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
- ASSERT(dn->dn_oldphys);
ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
dn->dn_phys->dn_flags &
DNODE_FLAG_USERUSED_ACCOUNTED);
@@ -1116,20 +1110,39 @@
/*
* We intentionally modify the zap object even if the
- * net delta (due to phys-oldphys) is zero. Otherwise
+ * net delta is zero. Otherwise
* the block of the zap obj could be shared between
* datasets but need to be different between them after
* a bprewrite.
*/
- do_userquota_callback(os, dn->dn_oldphys, B_TRUE, tx);
- do_userquota_callback(os, dn->dn_phys, B_FALSE, tx);
/*
* The mutex is needed here for interlock with dnode_allocate.
*/
mutex_enter(&dn->dn_mtx);
- zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t));
- dn->dn_oldphys = NULL;
+ ASSERT(dn->dn_id_flags);
+ if (dn->dn_id_flags & DN_ID_OLD_EXIST) {
+ do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
+ dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
+ }
+ if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
+ do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
+ dn->dn_phys->dn_flags, dn->dn_newuid,
+ dn->dn_newgid, B_FALSE, tx);
+ }
+
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
+ dn->dn_olduid = dn->dn_newuid;
+ dn->dn_oldgid = dn->dn_newgid;
+ dn->dn_id_flags |= DN_ID_OLD_EXIST;
+ if (dn->dn_bonuslen == 0)
+ dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+ else
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ }
+ dn->dn_id_flags &= ~(DN_ID_NEW_EXIST|DN_ID_SYNC);
mutex_exit(&dn->dn_mtx);
list_remove(list, dn);
@@ -1137,6 +1150,71 @@
}
}
+void
+dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before)
+{
+ objset_t *os = dn->dn_objset;
+ void *data = NULL;
+ dmu_buf_t *spilldb = NULL;
+ uint64_t *user, *group;
+ int flags = dn->dn_id_flags;
+ int error;
+
+ if (!dmu_objset_userused_enabled(dn->dn_objset))
+ return;
+
+ if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
+ DN_ID_CHKED_SPILL)))
+ return;
+
+ if (before && dn->dn_bonuslen != 0)
+ data = DN_BONUS(dn->dn_phys);
+ else if (!before && dn->dn_bonuslen != 0)
+ data = dn->dn_bonus != NULL ?
+ dn->dn_bonus->db.db_data : DN_BONUS(dn->dn_phys);
+ else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
+ int rf = 0;
+
+ if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
+ rf |= DB_RF_HAVESTRUCT;
+ error = dmu_spill_hold_by_dnode(dn, rf, FTAG, &spilldb);
+ ASSERT(error == 0);
+ data = spilldb->db_data;
+ } else {
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ mutex_exit(&dn->dn_mtx);
+ return;
+ }
+
+ if (before) {
+ user = &dn->dn_olduid;
+ group = &dn->dn_oldgid;
+ } else {
+ user = &dn->dn_newuid;
+ group = &dn->dn_newgid;
+ }
+
+ ASSERT(data);
+ error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
+ user, group);
+
+ mutex_enter(&dn->dn_mtx);
+ if (error == 0 && before)
+ dn->dn_id_flags |= DN_ID_OLD_EXIST;
+ if (error == 0 && !before)
+ dn->dn_id_flags |= DN_ID_NEW_EXIST;
+
+ if (spilldb) {
+ dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+ } else {
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ }
+ mutex_exit(&dn->dn_mtx);
+ if (spilldb)
+ dmu_buf_rele(spilldb, FTAG);
+}
+
boolean_t
dmu_objset_userspace_present(objset_t *os)
{
--- a/usr/src/uts/common/fs/zfs/dmu_send.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -183,6 +183,31 @@
}
static int
+dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data)
+{
+ struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill);
+
+ if (ba->pending_op != PENDING_NONE) {
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+ return (EINTR);
+ ba->pending_op = PENDING_NONE;
+ }
+
+ /* write a SPILL record */
+ bzero(ba->drr, sizeof (dmu_replay_record_t));
+ ba->drr->drr_type = DRR_SPILL;
+ drrs->drr_object = object;
+ drrs->drr_length = blksz;
+ drrs->drr_toguid = ba->toguid;
+
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+ return (EINTR);
+ if (dump_bytes(ba, data, blksz))
+ return (EINTR);
+ return (0);
+}
+
+static int
dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
{
struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects);
@@ -319,6 +344,18 @@
break;
}
(void) arc_buf_remove_ref(abuf, &abuf);
+ } else if (type == DMU_OT_SA) {
+ uint32_t aflags = ARC_WAIT;
+ arc_buf_t *abuf;
+ int blksz = BP_GET_LSIZE(bp);
+
+ if (arc_read_nolock(NULL, spa, bp,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
+ return (EIO);
+
+ err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data);
+ (void) arc_buf_remove_ref(abuf, &abuf);
} else { /* it's a level-0 block of a regular object */
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf;
@@ -908,6 +945,11 @@
DO64(drr_free.drr_length);
DO64(drr_free.drr_toguid);
break;
+ case DRR_SPILL:
+ DO64(drr_spill.drr_object);
+ DO64(drr_spill.drr_length);
+ DO64(drr_spill.drr_toguid);
+ break;
case DRR_END:
DO64(drr_end.drr_checksum.zc_word[0]);
DO64(drr_end.drr_checksum.zc_word[1]);
@@ -969,8 +1011,9 @@
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen);
}
- if (err)
+ if (err) {
return (EINVAL);
+ }
tx = dmu_tx_create(os);
dmu_tx_hold_bonus(tx, drro->drr_object);
@@ -1121,6 +1164,56 @@
return (0);
}
+static int
+restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
+{
+ dmu_tx_t *tx;
+ void *data;
+ dmu_buf_t *db, *db_spill;
+ int err;
+
+ if (drrs->drr_length < SPA_MINBLOCKSIZE ||
+ drrs->drr_length > SPA_MAXBLOCKSIZE)
+ return (EINVAL);
+
+ data = restore_read(ra, drrs->drr_length);
+ if (data == NULL)
+ return (ra->err);
+
+ if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
+ return (EINVAL);
+
+ VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
+ if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_spill(tx, db->db_object);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_buf_rele(db, FTAG);
+ dmu_buf_rele(db_spill, FTAG);
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ dmu_buf_will_dirty(db_spill, tx);
+
+ if (db_spill->db_size < drrs->drr_length)
+ VERIFY(0 == dbuf_spill_set_blksz(db_spill,
+ drrs->drr_length, tx));
+ bcopy(data, db_spill->db_data, drrs->drr_length);
+
+ dmu_buf_rele(db, FTAG);
+ dmu_buf_rele(db_spill, FTAG);
+
+ dmu_tx_commit(tx);
+ return (0);
+}
+
/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
@@ -1276,6 +1369,12 @@
ra.err = ECKSUM;
goto out;
}
+ case DRR_SPILL:
+ {
+ struct drr_spill drrs = drr->drr_u.drr_spill;
+ ra.err = restore_spill(&ra, os, &drrs);
+ break;
+ }
default:
ra.err = EINVAL;
goto out;
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c Tue Mar 16 09:43:38 2010 -0600
@@ -33,6 +33,8 @@
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
#include <sys/callb.h>
struct prefetch_data {
@@ -273,6 +275,17 @@
break;
lasterr = err;
}
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ SET_BOOKMARK(&czb, objset,
+ object, 0, DMU_SPILL_BLKID);
+ err = traverse_visitbp(td, dnp, buf,
+ (blkptr_t *)&dnp->dn_spill, &czb);
+ if (err) {
+ if (!hard)
+ break;
+ lasterr = err;
+ }
+ }
}
return (err != 0 ? err : lasterr);
}
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c Tue Mar 16 09:43:38 2010 -0600
@@ -33,7 +33,10 @@
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h> /* for fzap_default_block_shift */
#include <sys/spa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
+#include <sys/varargs.h>
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
uint64_t arg1, uint64_t arg2);
@@ -813,10 +816,11 @@
match_offset = TRUE;
/*
* We will let this hold work for the bonus
- * buffer so that we don't need to hold it
- * when creating a new object.
+ * or spill buffer so that we don't need to
+ * hold it when creating a new object.
*/
- if (blkid == DB_BONUS_BLKID)
+ if (blkid == DMU_BONUS_BLKID ||
+ blkid == DMU_SPILL_BLKID)
match_offset = TRUE;
/*
* They might have to increase nlevels,
@@ -837,8 +841,12 @@
txh->txh_arg2 == DMU_OBJECT_END))
match_offset = TRUE;
break;
+ case THT_SPILL:
+ if (blkid == DMU_SPILL_BLKID)
+ match_offset = TRUE;
+ break;
case THT_BONUS:
- if (blkid == DB_BONUS_BLKID)
+ if (blkid == DMU_BONUS_BLKID)
match_offset = TRUE;
break;
case THT_ZAP:
@@ -1204,3 +1212,141 @@
kmem_free(dcb, sizeof (dmu_tx_callback_t));
}
}
+
+/*
+ * Interface to hold a bunch of attributes.
+ * used for creating new files.
+ * attrsize is the total size of all attributes
+ * to be added during object creation
+ *
+ * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
+ */
+
+/*
+ * hold necessary attribute name for attribute registration.
+ * should be a very rare case where this is needed. If it does
+ * happen it would only happen on the first write to the file system.
+ */
+static void
+dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
+{
+ int i;
+
+ if (!sa->sa_need_attr_registration)
+ return;
+
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (!sa->sa_attr_table[i].sa_registered) {
+ if (sa->sa_reg_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
+ B_TRUE, sa->sa_attr_table[i].sa_name);
+ else
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
+ B_TRUE, sa->sa_attr_table[i].sa_name);
+ }
+ }
+}
+
+
+void
+dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
+{
+ dnode_t *dn;
+ dmu_tx_hold_t *txh;
+ blkptr_t *bp;
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+ THT_SPILL, 0, 0);
+
+ dn = txh->txh_dnode;
+
+ if (dn == NULL)
+ return;
+
+ /* If blkptr doesn't exist then add space to towrite */
+ bp = &dn->dn_phys->dn_spill;
+ if (BP_IS_HOLE(bp)) {
+ txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tounref = 0;
+ } else {
+ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+ bp->blk_birth))
+ txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+ else
+ txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ if (bp->blk_birth)
+ txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+ }
+}
+
+void
+dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
+{
+ sa_os_t *sa = tx->tx_objset->os_sa;
+
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+ if (tx->tx_objset->os_sa->sa_master_obj == 0)
+ return;
+
+ if (tx->tx_objset->os_sa->sa_layout_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+ else {
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ }
+
+ dmu_tx_sa_registration_hold(sa, tx);
+
+ if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
+ return;
+
+ (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
+ THT_SPILL, 0, 0);
+}
+
+/*
+ * Hold SA attribute
+ *
+ * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
+ *
+ * variable_size is the total size of all variable sized attributes
+ * passed to this function. It is not the total size of all
+ * variable size attributes that *may* exist on this object.
+ */
+void
+dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+{
+ uint64_t object;
+ sa_os_t *sa = tx->tx_objset->os_sa;
+
+ ASSERT(hdl != NULL);
+
+ object = sa_handle_object(hdl);
+
+ dmu_tx_hold_bonus(tx, object);
+
+ if (tx->tx_objset->os_sa->sa_master_obj == 0)
+ return;
+
+ if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
+ tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ }
+
+ dmu_tx_sa_registration_hold(sa, tx);
+
+ if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+
+ if (sa->sa_force_spill || may_grow || hdl->sa_spill ||
+ ((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_have_spill) {
+ ASSERT(tx->tx_txg == 0);
+ dmu_tx_hold_spill(tx, object);
+ }
+}
--- a/usr/src/uts/common/fs/zfs/dnode.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode.c Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -210,6 +210,11 @@
ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
}
+
+ /* Swap SPILL block if we have one */
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+ byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
+
}
void
@@ -258,6 +263,28 @@
rw_exit(&dn->dn_struct_rwlock);
}
+void
+dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
+{
+ ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dn->dn_bonustype = newtype;
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+ ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
+ dn->dn_have_spill = B_FALSE;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
static void
dnode_setdblksz(dnode_t *dn, int size)
{
@@ -294,6 +321,7 @@
dn->dn_bonustype = dnp->dn_bonustype;
dn->dn_bonuslen = dnp->dn_bonuslen;
dn->dn_maxblkid = dnp->dn_maxblkid;
+ dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
dmu_zfetch_init(&dn->dn_zfetch, dn);
@@ -321,7 +349,7 @@
}
ASSERT(NULL == list_head(&dn->dn_dbufs));
#endif
- ASSERT(dn->dn_oldphys == NULL);
+ ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
mutex_enter(&os->os_lock);
list_remove(&os->os_dnodes, dn);
@@ -368,6 +396,7 @@
ASSERT(ot != DMU_OT_NONE);
ASSERT3U(ot, <, DMU_OT_NUMTYPES);
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+ (bonustype == DMU_OT_SA && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0));
ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
@@ -383,6 +412,8 @@
ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
+ ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
ASSERT3U(dn->dn_next_blksz[i], ==, 0);
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
@@ -393,7 +424,11 @@
dnode_setdblksz(dn, blocksize);
dn->dn_indblkshift = ibs;
dn->dn_nlevels = 1;
- dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+ dn->dn_nblkptr = 1;
+ else
+ dn->dn_nblkptr = 1 +
+ ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
@@ -407,10 +442,12 @@
}
dn->dn_allocated_txg = tx->tx_txg;
+ dn->dn_id_flags = 0;
dnode_setdirty(dn, tx);
dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
}
@@ -446,8 +483,14 @@
if (dn->dn_bonuslen != bonuslen)
dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ if (dn->dn_bonustype != bonustype)
+ dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
if (dn->dn_nblkptr != nblkptr)
dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
+ if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
+ dn->dn_have_spill = B_FALSE;
+ }
rw_exit(&dn->dn_struct_rwlock);
/* change type */
@@ -627,11 +670,15 @@
if (dn->dn_free_txg ||
((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
((flag & DNODE_MUST_BE_FREE) &&
- (type != DMU_OT_NONE || dn->dn_oldphys))) {
+ (type != DMU_OT_NONE || (dn->dn_id_flags & DN_ID_SYNC)))) {
mutex_exit(&dn->dn_mtx);
dbuf_rele(db, FTAG);
return (type == DMU_OT_NONE ? ENOENT : EEXIST);
}
+ if (flag & DNODE_MUST_BE_FREE) {
+ ASSERT(refcount_is_zero(&dn->dn_holds));
+ ASSERT(!(dn->dn_id_flags & DN_ID_SYNC));
+ }
mutex_exit(&dn->dn_mtx);
if (refcount_add(&dn->dn_holds, tag) == 1)
@@ -706,6 +753,11 @@
mutex_exit(&dn->dn_mtx);
#endif
+ /*
+ * Determine old uid/gid when necessary
+ */
+ dmu_objset_userquota_get_ids(dn, B_TRUE);
+
mutex_enter(&os->os_lock);
/*
@@ -720,6 +772,7 @@
ASSERT(dn->dn_datablksz != 0);
ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
+ ASSERT3U(dn->dn_next_bonustype[txg&TXG_MASK], ==, 0);
dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
dn->dn_object, txg);
@@ -814,7 +867,8 @@
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
- if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) {
+ if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_blkid != DMU_SPILL_BLKID) {
mutex_exit(&dn->dn_dbufs_mtx);
goto fail;
}
@@ -858,7 +912,7 @@
int epbs, new_nlevels;
uint64_t sz;
- ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(have_read ?
RW_READ_HELD(&dn->dn_struct_rwlock) :
@@ -915,7 +969,8 @@
for (dr = list_head(list); dr; dr = dr_next) {
dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
if (dr->dr_dbuf->db_level != new_nlevels-1 &&
- dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) {
+ dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
list_remove(&dn->dn_dirty_records[txgoff], dr);
list_insert_tail(&new->dt.di.dr_children, dr);
@@ -1170,6 +1225,20 @@
rw_exit(&dn->dn_struct_rwlock);
}
+static boolean_t
+dnode_spill_freed(dnode_t *dn)
+{
+ int i;
+
+ mutex_enter(&dn->dn_mtx);
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
+ break;
+ }
+ mutex_exit(&dn->dn_mtx);
+ return (i < TXG_SIZE);
+}
+
/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
uint64_t
dnode_block_freed(dnode_t *dn, uint64_t blkid)
@@ -1178,7 +1247,7 @@
void *dp = spa_get_dsl(dn->dn_objset->os_spa);
int i;
- if (blkid == DB_BONUS_BLKID)
+ if (blkid == DMU_BONUS_BLKID)
return (FALSE);
/*
@@ -1191,6 +1260,9 @@
if (dn->dn_free_txg)
return (TRUE);
+ if (blkid == DMU_SPILL_BLKID)
+ return (dnode_spill_freed(dn));
+
range_tofind.fr_blkid = blkid;
mutex_enter(&dn->dn_mtx);
for (i = 0; i < TXG_SIZE; i++) {
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -434,7 +434,7 @@
db->db_last_dirty = NULL;
db->db_dirtycnt -= 1;
if (db->db_level == 0) {
- ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
dr->dt.dl.dr_data == db->db_buf);
dbuf_unoverride(dr);
}
@@ -490,6 +490,7 @@
dn->dn_maxblkid = 0;
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
+ dn->dn_have_spill = B_FALSE;
mutex_exit(&dn->dn_mtx);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
@@ -512,6 +513,7 @@
int txgoff = tx->tx_txg & TXG_MASK;
list_t *list = &dn->dn_dirty_records[txgoff];
static const dnode_phys_t zerodn = { 0 };
+ boolean_t kill_spill = B_FALSE;
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
@@ -523,10 +525,13 @@
if (dmu_objset_userused_enabled(dn->dn_objset) &&
!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
- ASSERT(dn->dn_oldphys == NULL);
- dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t));
- *dn->dn_oldphys = *dn->dn_phys; /* struct assignment */
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
+ dn->dn_oldflags = dn->dn_phys->dn_flags;
+ dn->dn_id_flags |= DN_ID_SYNC;
dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
+ mutex_exit(&dn->dn_mtx);
+ dmu_objset_userquota_get_ids(dn, B_FALSE);
} else {
/* Once we account for it, we should always account for it. */
ASSERT(!(dn->dn_phys->dn_flags &
@@ -573,6 +578,24 @@
dn->dn_next_bonuslen[txgoff] = 0;
}
+ if (dn->dn_next_bonustype[txgoff]) {
+ ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES);
+ dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
+ dn->dn_next_bonustype[txgoff] = 0;
+ }
+
+ /*
+ * We will either remove a spill block when a file is being removed
+ * or we have been asked to remove it.
+ */
+ if (dn->dn_rm_spillblk[txgoff] ||
+ ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
+ dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) {
+ if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ kill_spill = B_TRUE;
+ dn->dn_rm_spillblk[txgoff] = 0;
+ }
+
if (dn->dn_next_indblkshift[txgoff]) {
ASSERT(dnp->dn_nlevels == 1);
dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
@@ -589,6 +612,21 @@
mutex_exit(&dn->dn_mtx);
+ if (kill_spill) {
+ dmu_buf_impl_t *spilldb;
+ (void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
+ mutex_enter(&dn->dn_mtx);
+ dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
+ mutex_exit(&dn->dn_mtx);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ spilldb = dbuf_find(dn, 0, DMU_SPILL_BLKID);
+ if (spilldb) {
+ spilldb->db_blkptr = NULL;
+ mutex_exit(&spilldb->db_mtx);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
/* process all the "freed" ranges in the file */
while (rp = avl_last(&dn->dn_ranges[txgoff])) {
dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c Tue Mar 16 09:43:38 2010 -0600
@@ -343,7 +343,7 @@
for (ds = list_head(&dp->dp_synced_datasets); ds;
ds = list_next(&dp->dp_synced_datasets, ds))
- dmu_objset_do_userquota_callbacks(ds->ds_objset, tx);
+ dmu_objset_do_userquota_updates(ds->ds_objset, tx);
/*
* Sync the datasets again to push out the changes due to
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c Tue Mar 16 09:43:38 2010 -0600
@@ -42,6 +42,8 @@
#include <sys/zil_impl.h>
#include <sys/zio_checksum.h>
#include <sys/ddt.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
@@ -612,6 +614,12 @@
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zbookmark_t czb;
+ SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
+ scrub_visitbp(dp, dnp, buf, &dnp->dn_spill, &czb);
+ }
}
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sa.c Tue Mar 16 09:43:38 2010 -0600
@@ -0,0 +1,1887 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sunddi.h>
+#include <sys/sa_impl.h>
+#include <sys/dnode.h>
+#include <sys/errno.h>
+#include <sys/zfs_context.h>
+
+/*
+ * ZFS System attributes:
+ *
+ * A generic mechanism to allow for arbitrary attributes
+ * to be stored in a dnode. The data will be stored in the bonus buffer of
+ * the dnode and if necessary a special "spill" block will be used to handle
+ * overflow situations. The spill block will be sized to fit the data
+ * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the
+ * spill block is stored at the end of the current bonus buffer. Any
+ * attributes that would be in the way of the blkptr_t will be relocated
+ * into the spill block.
+ *
+ * Attribute registration:
+ *
+ * Stored persistently on a per dataset basis
+ * a mapping between attribute "string" names and their actual attribute
+ * numeric values, length, and byteswap function. The names are only used
+ * during registration. All attributes are known by their unique attribute
+ * id value. If an attribute can have a variable size then the value
+ * 0 will be used to indicate this.
+ *
+ * Attribute Layout:
+ *
+ * Attribute layouts are a way to compactly store multiple attributes, but
+ * without taking the overhead associated with managing each attribute
+ * individually. Since you will typically have the same set of attributes
+ * stored in the same order a single table will be used to represent that
+ * layout. The ZPL for example will usually have only about 10 different
+ * layouts (regular files, device files, symlinks,
+ * regular files + scanstamp, files/dir with extended attributes, and then
+ * you have the possibility of all of those minus ACL, because it would
+ * be kicked out into the spill block)
+ *
+ * Layouts are simply an array of the attributes and their
+ * ordering i.e. [0, 1, 4, 5, 2]
+ *
+ * Each distinct layout is given a unique layout number and that is whats
+ * stored in the header at the beginning of the SA data buffer.
+ *
+ * A layout only covers a single dbuf (bonus or spill). If a set of
+ * attributes is split up between the bonus buffer and a spill buffer then
+ * two different layouts will be used. This allows us to byteswap the
+ * spill without looking at the bonus buffer and keeps the on disk format of
+ * the bonus and spill buffer the same.
+ *
+ * Adding a single attribute will cause the entire set of attributes to
+ * be rewritten and could result in a new layout number being constructed
+ * as part of the rewrite if no such layout exists for the new set of
+ * attribues. The new attribute will be appended to the end of the already
+ * existing attributes.
+ *
+ * Both the attribute registration and attribute layout information are
+ * stored in normal ZAP attributes. Their should be a small number of
+ * known layouts and the set of attributes is assumed to typically be quite
+ * small.
+ *
+ * The registered attributes and layout "table" information is maintained
+ * in core and a special "sa_os_t" is attached to the objset_t.
+ *
+ * A special interface is provided to allow for quickly applying
+ * a large set of attributes at once. sa_replace_all_by_template() is
+ * used to set an array of attributes. This is used by the ZPL when
+ * creating a brand new file. The template that is passed into the function
+ * specifies the attribute, size for variable length attributes, location of
+ * data and special "data locator" function if the data isn't in a contiguous
+ * location.
+ *
+ * Byteswap implications:
+ * Since the SA attributes are not entirely self describing we can't do
+ * the normal byteswap processing. The special ZAP layout attribute and
+ * attribute registration attributes define the byteswap function and the
+ * size of the attributes, unless it is variable sized.
+ * The normal ZFS byteswapping infrastructure assumes you don't need
+ * to read any objects in order to do the necessary byteswapping. Whereas
+ * SA attributes can only be properly byteswapped if the dataset is opened
+ * and the layout/attribute ZAP attributes are available. Because of this
+ * the SA attributes will be byteswapped when they are first accessed by
+ * the SA code that will read the SA data.
+ */
+
+typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
+ uint16_t length, int length_idx, boolean_t, void *userp);
+
+static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
+static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
+static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
+ void *data);
+static void sa_idx_tab_rele(objset_t *os, void *arg);
+static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
+ int buflen);
+static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+ sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+ uint16_t buflen, dmu_tx_t *tx);
+
+arc_byteswap_func_t *sa_bswap_table[] = {
+ byteswap_uint64_array,
+ byteswap_uint32_array,
+ byteswap_uint16_array,
+ byteswap_uint8_array,
+ zfs_acl_byteswap,
+};
+
+#define SA_COPY_DATA(f, s, t, l) \
+ { \
+ if (f == NULL) { \
+ if (l == 8) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ } else if (l == 16) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ *(uint64_t *)((uintptr_t)t + 8) = \
+ *(uint64_t *)((uintptr_t)s + 8); \
+ } else { \
+ bcopy(s, t, l); \
+ } \
+ } else \
+ sa_copy_data(f, s, t, l); \
+ }
+
+/*
+ * This table is fixed and cannot be changed. Its purpose is to
+ * allow the SA code to work with both old/new ZPL file systems.
+ * It contains the list of legacy attributes. These attributes aren't
+ * stored in the "attribute" registry zap objects, since older ZPL file systems
+ * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will
+ * use this static table.
+ */
+sa_attr_reg_t sa_legacy_attrs[] = {
+ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+};
+
+/*
+ * ZPL legacy layout
+ * This is only used for objects of type DMU_OT_ZNODE
+ */
+sa_attr_type_t sa_legacy_zpl_layout[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+/*
+ * Special dummy layout used for buffers with no attributes.
+ */
+
+sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
+
+static int sa_legacy_attr_count = 16;
+static kmem_cache_t *sa_cache = NULL;
+
+/*ARGSUSED*/
+static int
+sa_cache_constructor(void *buf, void *unused, int kmflag)
+{
+ sa_handle_t *hdl = buf;
+
+ hdl->sa_bonus_tab = NULL;
+ hdl->sa_spill_tab = NULL;
+ hdl->sa_os = NULL;
+ hdl->sa_userp = NULL;
+ hdl->sa_bonus = NULL;
+ hdl->sa_spill = NULL;
+ mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+sa_cache_destructor(void *buf, void *unused)
+{
+ sa_handle_t *hdl = buf;
+ mutex_destroy(&hdl->sa_lock);
+}
+
+void
+sa_cache_init(void)
+{
+ sa_cache = kmem_cache_create("sa_cache",
+ sizeof (sa_handle_t), 0, sa_cache_constructor,
+ sa_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+sa_cache_fini(void)
+{
+ if (sa_cache)
+ kmem_cache_destroy(sa_cache);
+}
+
+static int
+layout_num_compare(const void *arg1, const void *arg2)
+{
+ const sa_lot_t *node1 = arg1;
+ const sa_lot_t *node2 = arg2;
+
+ if (node1->lot_num > node2->lot_num)
+ return (1);
+ else if (node1->lot_num < node2->lot_num)
+ return (-1);
+ return (0);
+}
+
+static int
+layout_hash_compare(const void *arg1, const void *arg2)
+{
+ const sa_lot_t *node1 = arg1;
+ const sa_lot_t *node2 = arg2;
+
+ if (node1->lot_hash > node2->lot_hash)
+ return (1);
+ if (node1->lot_hash < node2->lot_hash)
+ return (-1);
+ if (node1->lot_instance > node2->lot_instance)
+ return (1);
+ if (node1->lot_instance < node2->lot_instance)
+ return (-1);
+ return (0);
+}
+
+boolean_t
+sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
+{
+ int i;
+
+ if (count != tbf->lot_attr_count)
+ return (1);
+
+ for (i = 0; i != count; i++) {
+ if (attrs[i] != tbf->lot_attrs[i])
+ return (1);
+ }
+ return (0);
+}
+
+#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
+
+static uint64_t
+sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
+{
+ int i;
+ uint64_t crc = -1ULL;
+
+ for (i = 0; i != attr_count; i++)
+ crc ^= SA_ATTR_HASH(attrs[i]);
+
+ return (crc);
+}
+
+static boolean_t
+sa_has_blkptr(sa_handle_t *hdl)
+{
+ int rc;
+ if (hdl->sa_spill == NULL) {
+ if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL,
+ &hdl->sa_spill)) == 0)
+ VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+ } else {
+ rc = 0;
+ }
+
+ return (rc == 0 ? B_TRUE : B_FALSE);
+}
+
+/*
+ * Main attribute lookup/update function
+ * returns 0 for success or non zero for failures
+ *
+ * Operates on bulk array, first failure will abort further processing
+ */
+int
+sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+ sa_data_op_t data_op, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ int i;
+ int error = 0;
+ sa_buf_type_t buftypes;
+
+ buftypes = 0;
+
+ ASSERT(count > 0);
+ for (i = 0; i != count; i++) {
+ ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
+
+ bulk[i].sa_addr = NULL;
+ /* First check the bonus buffer */
+
+ if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
+ hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
+ SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
+ SA_GET_HDR(hdl, SA_BONUS),
+ bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
+ if (tx && !(buftypes & SA_BONUS)) {
+ dmu_buf_will_dirty(hdl->sa_bonus, tx);
+ buftypes |= SA_BONUS;
+ }
+ }
+ if (bulk[i].sa_addr == NULL && sa_has_blkptr(hdl)) {
+ if (TOC_ATTR_PRESENT(
+ hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
+ SA_ATTR_INFO(sa, hdl->sa_spill_tab,
+ SA_GET_HDR(hdl, SA_SPILL),
+ bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
+ if (tx && !(buftypes & SA_SPILL) &&
+ bulk[i].sa_size == bulk[i].sa_length) {
+ dmu_buf_will_dirty(hdl->sa_spill, tx);
+ buftypes |= SA_SPILL;
+ }
+ }
+ }
+ switch (data_op) {
+ case SA_LOOKUP:
+ if (bulk[i].sa_addr == NULL)
+ return (ENOENT);
+ if (bulk[i].sa_data) {
+ SA_COPY_DATA(bulk[i].sa_data_func,
+ bulk[i].sa_addr, bulk[i].sa_data,
+ bulk[i].sa_size);
+ }
+ continue;
+
+ case SA_UPDATE:
+ /* existing rewrite of attr */
+ if (bulk[i].sa_addr &&
+ bulk[i].sa_size == bulk[i].sa_length) {
+ SA_COPY_DATA(bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_addr,
+ bulk[i].sa_length);
+ continue;
+ } else if (bulk[i].sa_addr) { /* attr size change */
+ error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+ SA_REPLACE, bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_length, tx);
+ } else { /* adding new attribute */
+ error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+ SA_ADD, bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_length, tx);
+ }
+ if (error)
+ return (error);
+ break;
+ }
+ }
+ return (error);
+}
+
+static sa_lot_t *
+sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
+ uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *tb, *findtb;
+ int i;
+ avl_index_t loc;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
+ tb->lot_attr_count = attr_count;
+ tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+ KM_SLEEP);
+ bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
+ tb->lot_num = lot_num;
+ tb->lot_hash = hash;
+ tb->lot_instance = 0;
+
+ if (zapadd) {
+ char attr_name[8];
+
+ if (sa->sa_layout_attr_obj == 0) {
+ int error;
+ sa->sa_layout_attr_obj = zap_create(os,
+ DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx);
+ error = zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1,
+ &sa->sa_layout_attr_obj, tx);
+ ASSERT3U(error, ==, 0);
+ }
+
+ (void) snprintf(attr_name, sizeof (attr_name),
+ "%d", (int)lot_num);
+ VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
+ attr_name, 2, attr_count, attrs, tx));
+ }
+
+ list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
+ offsetof(sa_idx_tab_t, sa_next));
+
+ for (i = 0; i != attr_count; i++) {
+ if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
+ tb->lot_var_sizes++;
+ }
+
+ avl_add(&sa->sa_layout_num_tree, tb);
+
+ /* verify we don't have a hash collision */
+ if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
+ for (; findtb && findtb->lot_hash == hash;
+ findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
+ if (findtb->lot_instance != tb->lot_instance)
+ break;
+ tb->lot_instance++;
+ }
+ }
+ avl_add(&sa->sa_layout_hash_tree, tb);
+ return (tb);
+}
+
+static void
+sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
+ int count, dmu_tx_t *tx, sa_lot_t **lot)
+{
+ sa_lot_t *tb, tbsearch;
+ avl_index_t loc;
+ sa_os_t *sa = os->os_sa;
+ boolean_t found = B_FALSE;
+
+ mutex_enter(&sa->sa_lock);
+ tbsearch.lot_hash = hash;
+ tbsearch.lot_instance = 0;
+ tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc);
+ if (tb) {
+ for (; tb && tb->lot_hash == hash;
+ tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) {
+ if (sa_layout_equal(tb, attrs, count) == 0) {
+ found = B_TRUE;
+ break;
+ }
+ }
+ }
+ if (!found) {
+ tb = sa_add_layout_entry(os, attrs, count,
+ avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
+ }
+ mutex_exit(&sa->sa_lock);
+ *lot = tb;
+}
+
+static int
+sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
+{
+ int error;
+ uint32_t blocksize;
+
+ if (size == 0) {
+ blocksize = SPA_MINBLOCKSIZE;
+ } else if (size > SPA_MAXBLOCKSIZE) {
+ ASSERT(0);
+ return (EFBIG);
+ } else {
+ blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
+ }
+
+ error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
+ ASSERT(error == 0);
+ return (error);
+}
+
+static void
+sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
+{
+ if (func == NULL) {
+ bcopy(datastart, target, buflen);
+ } else {
+ boolean_t start;
+ int bytes;
+ void *dataptr;
+ void *saptr = target;
+ uint32_t length;
+
+ start = B_TRUE;
+ bytes = 0;
+ while (bytes < buflen) {
+ func(&dataptr, &length, buflen, start, datastart);
+ bcopy(dataptr, saptr, length);
+ saptr = (void *)((caddr_t)saptr + length);
+ bytes += length;
+ start = B_FALSE;
+ }
+ }
+}
+
+/*
+ * Determine several different sizes
+ * first the sa header size
+ * the number of bytes to be stored
+ * if spill would occur the index in the attribute array is returned
+ *
+ * the boolean will_spill will be set when spilling is necessary. It
+ * is only set when the buftype is SA_BONUS
+ */
+static int
+sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
+ dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total,
+ boolean_t *will_spill)
+{
+ int var_size = 0;
+ int i;
+ int full_space;
+ int hdrsize;
+ boolean_t done = B_FALSE;
+
+ if (buftype == SA_BONUS && sa->sa_force_spill) {
+ *total = 0;
+ *index = 0;
+ *will_spill = B_TRUE;
+ return (0);
+ }
+
+ *index = -1;
+ *total = 0;
+
+ if (buftype == SA_BONUS)
+ *will_spill = B_FALSE;
+
+ hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
+ sizeof (sa_hdr_phys_t);
+
+ full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size;
+
+ for (i = 0; i != attr_count; i++) {
+ boolean_t is_var_sz;
+
+ *total += attr_desc[i].sa_length;
+ if (done)
+ goto next;
+
+ is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
+ if (is_var_sz) {
+ var_size++;
+ }
+
+ if (is_var_sz && var_size > 1) {
+ if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
+ *total < full_space) {
+ hdrsize += sizeof (uint16_t);
+ } else {
+ done = B_TRUE;
+ *index = i;
+ if (buftype == SA_BONUS)
+ *will_spill = B_TRUE;
+ continue;
+ }
+ }
+
+ /*
+ * find index of where spill *could* occur.
+ * Then continue to count of remainder attribute
+ * space. The sum is used later for sizing bonus
+ * and spill buffer.
+ */
+ if (buftype == SA_BONUS && *index == -1 &&
+ P2ROUNDUP(*total + hdrsize, 8) >
+ (full_space - sizeof (blkptr_t))) {
+ *index = i;
+ done = B_TRUE;
+ }
+
+next:
+ if (P2ROUNDUP(*total + hdrsize, 8) > full_space &&
+ buftype == SA_BONUS)
+ *will_spill = B_TRUE;
+ }
+
+ hdrsize = P2ROUNDUP(hdrsize, 8);
+ return (hdrsize);
+}
+
+#define BUF_SPACE_NEEDED(total, header) (total + header)
+
+/*
+ * Find layout that corresponds to ordering of attributes
+ * If not found a new layout number is created and added to
+ * persistent layout tables.
+ */
+static int
+sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
+ dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ uint64_t hash;
+ sa_buf_type_t buftype;
+ sa_hdr_phys_t *sahdr;
+ void *data_start;
+ int buf_space;
+ sa_attr_type_t *attrs, *attrs_start;
+ int i, lot_count;
+ int hdrsize, spillhdrsize;
+ int used;
+ dmu_object_type_t bonustype;
+ sa_lot_t *lot;
+ int len_idx;
+ int spill_used;
+ boolean_t spilling;
+
+ dmu_buf_will_dirty(hdl->sa_bonus, tx);
+ bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
+
+ /* first determine bonus header size and sum of all attributes */
+ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
+ SA_BONUS, &i, &used, &spilling);
+
+ if (used > SPA_MAXBLOCKSIZE)
+ return (EFBIG);
+
+ VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
+ MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) :
+ used + hdrsize, tx));
+
+ ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
+ bonustype == DMU_OT_SA);
+
+ /* setup and size spill buffer when needed */
+ if (spilling) {
+ boolean_t dummy;
+
+ if (hdl->sa_spill == NULL) {
+ int error;
+ error = dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
+ &hdl->sa_spill);
+ ASSERT3U(error, ==, 0);
+ }
+ dmu_buf_will_dirty(hdl->sa_spill, tx);
+
+ spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
+ attr_count - i, hdl->sa_spill, SA_SPILL, &i,
+ &spill_used, &dummy);
+
+ if (spill_used > SPA_MAXBLOCKSIZE)
+ return (EFBIG);
+
+ buf_space = hdl->sa_spill->db_size - spillhdrsize;
+ if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
+ hdl->sa_spill->db_size)
+ VERIFY(0 == sa_resize_spill(hdl,
+ BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
+ }
+
+ /* setup starting pointers to lay down data */
+ data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
+ sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
+ buftype = SA_BONUS;
+
+ if (spilling)
+ buf_space = (sa->sa_force_spill) ?
+ 0 : SA_BLKPTR_SPACE - hdrsize;
+ else
+ buf_space = hdl->sa_bonus->db_size - hdrsize;
+
+ attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+ KM_SLEEP);
+ lot_count = 0;
+
+ for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
+ uint16_t length;
+
+ attrs[i] = attr_desc[i].sa_attr;
+ length = SA_REGISTERED_LEN(sa, attrs[i]);
+ if (length == 0)
+ length = attr_desc[i].sa_length;
+
+ if (buf_space < length) { /* switch to spill buffer */
+ ASSERT(bonustype != DMU_OT_ZNODE);
+ if (buftype == SA_BONUS && !sa->sa_force_spill) {
+ sa_find_layout(hdl->sa_os, hash, attrs_start,
+ lot_count, tx, &lot);
+ SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
+ }
+
+ buftype = SA_SPILL;
+ hash = -1ULL;
+ len_idx = 0;
+
+ sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
+ sahdr->sa_magic = SA_MAGIC;
+ data_start = (void *)((uintptr_t)sahdr +
+ spillhdrsize);
+ attrs_start = &attrs[i];
+ buf_space = hdl->sa_spill->db_size - spillhdrsize;
+ lot_count = 0;
+ }
+ hash ^= SA_ATTR_HASH(attrs[i]);
+ attr_desc[i].sa_addr = data_start;
+ attr_desc[i].sa_size = length;
+ SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
+ data_start, length);
+ if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
+ sahdr->sa_lengths[len_idx++] = length;
+ }
+ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+ length), 8);
+ buf_space -= P2ROUNDUP(length, 8);
+ lot_count++;
+ }
+
+ sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
+ if (bonustype == DMU_OT_SA) {
+ SA_SET_HDR(sahdr, lot->lot_num,
+ buftype == SA_BONUS ? hdrsize : spillhdrsize);
+ }
+
+ kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
+ if (hdl->sa_bonus_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+ hdl->sa_bonus_tab = NULL;
+ }
+ if (!sa->sa_force_spill)
+ VERIFY(0 == sa_build_index(hdl, SA_BONUS));
+ if (hdl->sa_spill) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ if (!spilling) {
+ /*
+ * remove spill block that is no longer needed.
+ * set sa_spill_remove to prevent sa_attr_op
+ * from trying to retrieve spill block before its
+ * been removed. The flag will be cleared if/when
+ * the handle is destroyed recreated or
+ * sa_build_layouts() needs to spill again.
+ */
+ dmu_buf_rele(hdl->sa_spill, NULL);
+ hdl->sa_spill = NULL;
+ hdl->sa_spill_tab = NULL;
+ VERIFY(0 == dmu_rm_spill(hdl->sa_os,
+ sa_handle_object(hdl), tx));
+ } else {
+ VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+ }
+ }
+
+ return (0);
+}
+
+static void
+sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
+{
+ sa_os_t *sa = os->os_sa;
+ uint64_t sa_attr_count = 0;
+ int error = 0;
+ uint64_t attr_value;
+ sa_attr_table_t *tb;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int registered_count = 0;
+ int i;
+ dmu_objset_type_t ostype = dmu_objset_type(os);
+
+ sa->sa_user_table =
+ kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
+ sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
+
+ if (sa->sa_reg_attr_obj != 0)
+ VERIFY(zap_count(os, sa->sa_reg_attr_obj, &sa_attr_count) == 0);
+
+ if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
+ sa_attr_count += sa_legacy_attr_count;
+
+ /* Allocate attribute numbers for attributes that aren't registered */
+ for (i = 0; i != count; i++) {
+ boolean_t found = B_FALSE;
+ int j;
+
+ if (ostype == DMU_OST_ZFS) {
+ for (j = 0; j != sa_legacy_attr_count; j++) {
+ if (strcmp(reg_attrs[i].sa_name,
+ sa_legacy_attrs[j].sa_name) == 0) {
+ sa->sa_user_table[i] =
+ sa_legacy_attrs[j].sa_attr;
+ found = B_TRUE;
+ }
+ }
+ }
+ if (found)
+ continue;
+
+ if (sa->sa_reg_attr_obj)
+ error = zap_lookup(os, sa->sa_reg_attr_obj,
+ reg_attrs[i].sa_name, 8, 1, &attr_value);
+ else
+ error = ENOENT;
+ switch (error) {
+ default:
+ case ENOENT:
+ sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
+ sa_attr_count++;
+ break;
+ case 0:
+ sa->sa_user_table[i] = ATTR_NUM(attr_value);
+ break;
+ }
+ }
+
+ os->os_sa->sa_num_attrs = sa_attr_count;
+ tb = os->os_sa->sa_attr_table =
+ kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
+
+ /*
+ * Attribute table is constructed from requested attribute list,
+ * previously foreign registered attributes, and also the legacy
+ * ZPL set of attributes.
+ */
+
+ if (sa->sa_reg_attr_obj) {
+ for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t value;
+ value = za.za_first_integer;
+
+ registered_count++;
+ tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
+ tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
+ tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
+ tb[ATTR_NUM(value)].sa_registered = B_TRUE;
+
+ if (tb[ATTR_NUM(value)].sa_name) {
+ continue;
+ }
+ tb[ATTR_NUM(value)].sa_name =
+ kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
+ (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
+ strlen(za.za_name) +1);
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ if (ostype == DMU_OST_ZFS) {
+ for (i = 0; i != sa_legacy_attr_count; i++) {
+ if (tb[i].sa_name)
+ continue;
+ tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
+ tb[i].sa_length = sa_legacy_attrs[i].sa_length;
+ tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
+ tb[i].sa_registered = B_FALSE;
+ tb[i].sa_name =
+ kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
+ KM_SLEEP);
+ (void) strlcpy(tb[i].sa_name,
+ sa_legacy_attrs[i].sa_name,
+ strlen(sa_legacy_attrs[i].sa_name) + 1);
+ }
+ }
+
+ for (i = 0; i != count; i++) {
+ sa_attr_type_t attr_id;
+
+ attr_id = sa->sa_user_table[i];
+ if (tb[attr_id].sa_name)
+ continue;
+
+ tb[attr_id].sa_length = reg_attrs[i].sa_length;
+ tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
+ tb[attr_id].sa_attr = attr_id;
+ tb[attr_id].sa_name =
+ kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
+ (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
+ strlen(reg_attrs[i].sa_name) + 1);
+ }
+
+ os->os_sa->sa_need_attr_registration =
+ (sa_attr_count != registered_count);
+}
+
+sa_attr_type_t *
+sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ sa_os_t *sa;
+ dmu_objset_type_t ostype = dmu_objset_type(os);
+ sa_attr_type_t *tb;
+
+ mutex_enter(&os->os_lock);
+ if (os->os_sa) {
+ mutex_enter(&os->os_sa->sa_lock);
+ mutex_exit(&os->os_lock);
+ tb = os->os_sa->sa_user_table;
+ mutex_exit(&os->os_sa->sa_lock);
+ return (tb);
+ }
+
+ sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
+ mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+ sa->sa_master_obj = sa_obj;
+
+ mutex_enter(&sa->sa_lock);
+ mutex_exit(&os->os_lock);
+ avl_create(&sa->sa_layout_num_tree, layout_num_compare,
+ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
+ avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
+ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
+
+ if (sa_obj) {
+ int error;
+ error = zap_lookup(os, sa_obj, SA_LAYOUTS,
+ 8, 1, &sa->sa_layout_attr_obj);
+ if (error != 0 && error != ENOENT) {
+ return (NULL);
+ }
+ error = zap_lookup(os, sa_obj, SA_REGISTRY,
+ 8, 1, &sa->sa_reg_attr_obj);
+ if (error != 0 && error != ENOENT) {
+ mutex_exit(&sa->sa_lock);
+ return (NULL);
+ }
+ }
+
+ os->os_sa = sa;
+ sa_attr_table_setup(os, reg_attrs, count);
+
+ if (sa->sa_layout_attr_obj != 0) {
+ for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ sa_attr_type_t *lot_attrs;
+ uint64_t lot_num;
+
+ lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
+ za.za_num_integers, KM_SLEEP);
+
+ VERIFY(zap_lookup(os, sa->sa_layout_attr_obj,
+ za.za_name, 2, za.za_num_integers, lot_attrs) == 0);
+ VERIFY(ddi_strtoull(za.za_name, NULL, 10,
+ (unsigned long long *)&lot_num) == 0);
+
+ (void) sa_add_layout_entry(os, lot_attrs,
+ za.za_num_integers, lot_num,
+ sa_layout_info_hash(lot_attrs,
+ za.za_num_integers), B_FALSE, NULL);
+ kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+ za.za_num_integers);
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ /* Add special layout number for old ZNODES */
+ if (ostype == DMU_OST_ZFS) {
+ (void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
+ sa_legacy_attr_count, 0,
+ sa_layout_info_hash(sa_legacy_zpl_layout,
+ sa_legacy_attr_count), B_FALSE, NULL);
+
+ (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
+ 0, B_FALSE, NULL);
+ }
+ mutex_exit(&sa->sa_lock);
+ return (os->os_sa->sa_user_table);
+}
+
+void
+sa_tear_down(objset_t *os)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *layout;
+ void *cookie;
+ int i;
+
+ kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+
+ /* Free up attr table */
+
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_name)
+ kmem_free(sa->sa_attr_table[i].sa_name,
+ strlen(sa->sa_attr_table[i].sa_name) + 1);
+ }
+
+ kmem_free(sa->sa_attr_table,
+ sizeof (sa_attr_table_t) * sa->sa_num_attrs);
+
+ cookie = NULL;
+ while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) {
+ sa_idx_tab_t *tab;
+ while (tab = list_head(&layout->lot_idx_tab)) {
+ ASSERT(refcount_count(&tab->sa_refcount));
+ sa_idx_tab_rele(os, tab);
+ }
+ }
+
+ cookie = NULL;
+ while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) {
+ kmem_free(layout->lot_attrs,
+ sizeof (sa_attr_type_t) * layout->lot_attr_count);
+ kmem_free(layout, sizeof (sa_lot_t));
+ }
+
+ avl_destroy(&sa->sa_layout_hash_tree);
+ avl_destroy(&sa->sa_layout_num_tree);
+
+ kmem_free(sa, sizeof (sa_os_t));
+ os->os_sa = NULL;
+}
+
+void
+sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
+ uint16_t length, int length_idx, boolean_t var_length, void *userp)
+{
+ sa_idx_tab_t *idx_tab = userp;
+
+ if (var_length) {
+ ASSERT(idx_tab->sa_variable_lengths);
+ idx_tab->sa_variable_lengths[length_idx] = length;
+ }
+ TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx,
+ (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr));
+}
+
+static void
+sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
+ sa_iterfunc_t func, sa_lot_t *tab, void *userp)
+{
+ void *data_start;
+ sa_lot_t *tb = tab;
+ sa_lot_t search;
+ avl_index_t loc;
+ sa_os_t *sa = os->os_sa;
+ int i;
+ uint16_t *length_start;
+ uint8_t length_idx = 0;
+
+ if (tab == NULL) {
+ search.lot_num = SA_LAYOUT_NUM(hdr, type);
+ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+ ASSERT(tb);
+ }
+
+ if (IS_SA_BONUSTYPE(type)) {
+ data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
+ offsetof(sa_hdr_phys_t, sa_lengths) +
+ (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
+ length_start = hdr->sa_lengths;
+ } else {
+ data_start = hdr;
+ }
+
+ for (i = 0; i != tb->lot_attr_count; i++) {
+ int attr_length, reg_length;
+ uint8_t idx_len;
+
+ reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
+ if (reg_length) {
+ attr_length = reg_length;
+ idx_len = 0;
+ } else {
+ attr_length = length_start[length_idx];
+ idx_len = length_idx++;
+ }
+
+ func(hdr, data_start, tb->lot_attrs[i], attr_length,
+ idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
+
+ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+ attr_length), 8);
+ }
+}
+
+/*ARGSUSED*/
+void
+sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
+ uint16_t length, int length_idx, boolean_t variable_length, void *userp)
+{
+ sa_handle_t *hdl = userp;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length);
+}
+
+void
+sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+ dmu_buf_impl_t *db;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ int num_lengths = 1;
+ int i;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ if (sa_hdr_phys->sa_magic == SA_MAGIC)
+ return;
+
+ db = SA_GET_DB(hdl, buftype);
+
+ if (buftype == SA_SPILL) {
+ arc_release(db->db_buf, NULL);
+ arc_buf_thaw(db->db_buf);
+ }
+
+ sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
+ sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
+
+ /*
+ * Determine number of variable lenghts in header
+ * The standard 8 byte header has one for free and a
+ * 16 byte header would have 4 + 1;
+ */
+ if (SA_HDR_SIZE(sa_hdr_phys) > 8)
+ num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
+ for (i = 0; i != num_lengths; i++)
+ sa_hdr_phys->sa_lengths[i] =
+ BSWAP_16(sa_hdr_phys->sa_lengths[i]);
+
+ sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
+ sa_byteswap_cb, NULL, hdl);
+
+ if (buftype == SA_SPILL)
+ arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
+}
+
+static int
+sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys;
+ dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
+ dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_idx_tab_t *idx_tab;
+
+ sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+
+ mutex_enter(&sa->sa_lock);
+
+ /* Do we need to byteswap? */
+
+ /* only check if not old znode */
+ if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC &&
+ sa_hdr_phys->sa_magic != 0) {
+ VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC);
+ sa_byteswap(hdl, buftype);
+ }
+
+ idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
+
+ if (buftype == SA_BONUS)
+ hdl->sa_bonus_tab = idx_tab;
+ else
+ hdl->sa_spill_tab = idx_tab;
+
+ mutex_exit(&sa->sa_lock);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+sa_evict(dmu_buf_t *db, void *sap)
+{
+ panic("evicting sa dbuf %p\n", (void *)db);
+}
+
+static void
+sa_idx_tab_rele(objset_t *os, void *arg)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_idx_tab_t *idx_tab = arg;
+
+ if (idx_tab == NULL)
+ return;
+
+ mutex_enter(&sa->sa_lock);
+ if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) {
+ list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
+ if (idx_tab->sa_variable_lengths)
+ kmem_free(idx_tab->sa_variable_lengths,
+ sizeof (uint16_t) *
+ idx_tab->sa_layout->lot_var_sizes);
+ refcount_destroy(&idx_tab->sa_refcount);
+ kmem_free(idx_tab->sa_idx_tab,
+ sizeof (uint32_t) * sa->sa_num_attrs);
+ kmem_free(idx_tab, sizeof (sa_idx_tab_t));
+ }
+ mutex_exit(&sa->sa_lock);
+}
+
+static void
+sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
+{
+ sa_os_t *sa = os->os_sa;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ (void) refcount_add(&idx_tab->sa_refcount, NULL);
+}
+
+void
+sa_handle_destroy(sa_handle_t *hdl)
+{
+ mutex_enter(&hdl->sa_lock);
+ (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl,
+ NULL, NULL, NULL);
+
+ if (hdl->sa_bonus_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+ hdl->sa_bonus_tab = NULL;
+ }
+ if (hdl->sa_spill_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ hdl->sa_spill_tab = NULL;
+ }
+
+ dmu_buf_rele(hdl->sa_bonus, NULL);
+
+ if (hdl->sa_spill)
+ dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
+ mutex_exit(&hdl->sa_lock);
+
+ kmem_cache_free(sa_cache, hdl);
+}
+
+int
+sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
+ sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+ int error = 0;
+ dmu_object_info_t doi;
+ sa_handle_t *handle;
+
+#ifdef ZFS_DEBUG
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
+ doi.doi_bonus_type == DMU_OT_ZNODE);
+#endif
+ /* find handle, if it exists */
+ /* if one doesn't exist then create a new one, and initialize it */
+
+ handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL;
+ if (handle == NULL) {
+ sa_handle_t *newhandle;
+ handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
+ handle->sa_userp = userp;
+ handle->sa_bonus = db;
+ handle->sa_os = os;
+ handle->sa_spill = NULL;
+
+ error = sa_build_index(handle, SA_BONUS);
+ newhandle = (hdl_type == SA_HDL_SHARED) ?
+ dmu_buf_set_user_ie(db, handle,
+ NULL, sa_evict) : NULL;
+
+ if (newhandle != NULL) {
+ kmem_cache_free(sa_cache, handle);
+ handle = newhandle;
+ }
+ }
+ *handlepp = handle;
+
+ return (error);
+}
+
+int
+sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
+ sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+ dmu_buf_t *db;
+ int error;
+
+ if (error = dmu_bonus_hold(objset, objid, NULL, &db))
+ return (error);
+
+ return (sa_handle_get_from_db(objset, db, userp, hdl_type,
+ handlepp));
+}
+
+int
+sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
+{
+ return (dmu_bonus_hold(objset, obj_num, tag, db));
+}
+
+void
+sa_buf_rele(dmu_buf_t *db, void *tag)
+{
+ dmu_buf_rele(db, tag);
+}
+
+int
+sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
+{
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
+}
+
+int
+sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_attr = attr;
+ bulk.sa_data = buf;
+ bulk.sa_length = buflen;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_lookup_impl(hdl, &bulk, 1);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+#ifdef _KERNEL
+int
+sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_data = NULL;
+ bulk.sa_attr = attr;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+
+ mutex_enter(&hdl->sa_lock);
+ if (sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL) == 0) {
+ error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
+ uio->uio_resid), UIO_READ, uio);
+ } else {
+ error = ENOENT;
+ }
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+
+}
+#endif
+
+/*
+ * Find an already existing TOC from given os and data
+ * This is a special interface to be used by the ZPL for
+ * finding the uid/gid/gen attributes.
+ */
+void *
+sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data)
+{
+ sa_idx_tab_t *idx_tab;
+ sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data;
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *tb, search;
+ avl_index_t loc;
+
+ /*
+ * Deterimine layout number. If SA node and header == 0 then
+ * force the index table to the dummy "1" empty layout.
+ *
+ * The layout number would only be zero for a newly created file
+ * that has not added any attributes yet, or with crypto enabled which
+ * doesn't write any attributes to the bonus buffer.
+ */
+
+ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
+
+ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+
+ /* Verify header size is consistent with layout information */
+ ASSERT(tb);
+ ASSERT(IS_SA_BONUSTYPE(bonustype) &&
+ SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) ||
+ (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
+
+ /*
+ * See if any of the already existing TOC entries can be reused?
+ */
+
+ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
+ idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
+ boolean_t valid_idx = B_TRUE;
+ int i;
+
+ if (tb->lot_var_sizes != 0 &&
+ idx_tab->sa_variable_lengths != NULL) {
+ for (i = 0; i != tb->lot_var_sizes; i++) {
+ if (hdr->sa_lengths[i] !=
+ idx_tab->sa_variable_lengths[i]) {
+ valid_idx = B_FALSE;
+ break;
+ }
+ }
+ }
+ if (valid_idx) {
+ sa_idx_tab_hold(os, idx_tab);
+ return (idx_tab);
+ }
+ }
+
+ /* No such luck, create a new entry */
+ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
+ idx_tab->sa_idx_tab =
+ kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
+ idx_tab->sa_layout = tb;
+ refcount_create(&idx_tab->sa_refcount);
+ if (tb->lot_var_sizes)
+ idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
+ tb->lot_var_sizes, KM_SLEEP);
+
+ sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
+ tb, idx_tab);
+ sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */
+ sa_idx_tab_hold(os, idx_tab); /* one for layout */
+ list_insert_tail(&tb->lot_idx_tab, idx_tab);
+ return (idx_tab);
+}
+
+void
+sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
+ boolean_t start, void *userdata)
+{
+ ASSERT(start);
+
+ *dataptr = userdata;
+ *len = total_len;
+}
+
+static void
+sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+ uint64_t attr_value = 0;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_attr_table_t *tb = sa->sa_attr_table;
+ int i;
+
+ mutex_enter(&sa->sa_lock);
+
+ if (!sa->sa_need_attr_registration || sa->sa_master_obj == NULL) {
+ mutex_exit(&sa->sa_lock);
+ return;
+ }
+
+ if (sa->sa_reg_attr_obj == NULL) {
+ int error;
+ sa->sa_reg_attr_obj = zap_create(hdl->sa_os,
+ DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx);
+ error = zap_add(hdl->sa_os, sa->sa_master_obj,
+ SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx);
+ ASSERT(error == 0);
+ }
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_registered)
+ continue;
+ ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
+ tb[i].sa_byteswap);
+ VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
+ tb[i].sa_name, 8, 1, &attr_value, tx));
+ tb[i].sa_registered = B_TRUE;
+ }
+ sa->sa_need_attr_registration = B_FALSE;
+ mutex_exit(&sa->sa_lock);
+}
+
+/*
+ * Replace all attributes with attributes specified in template.
+ * If dnode had a spill buffer then those attributes will be
+ * also be replaced, possibly with just an empty spill block
+ *
+ * This interface is intended to only be used for bulk adding of
+ * attributes for a new file. It will also be used by the ZPL
+ * when converting and old formatted znode to native SA support.
+ */
+int
+sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ if (sa->sa_need_attr_registration)
+ sa_attr_register_sync(hdl, tx);
+ return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
+}
+
+int
+sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_replace_all_by_template_locked(hdl, attr_desc,
+ attr_count, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+/*
+ * add/remove/replace a single attribute and then rewrite the entire set
+ * of attributes.
+ */
+static int
+sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+ sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+ uint16_t buflen, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_bulk_attr_t *attr_desc;
+ void *old_data[2];
+ int bonus_attr_count = 0;
+ int bonus_data_size, spill_data_size;
+ int spill_attr_count = 0;
+ int error;
+ uint16_t length;
+ int i, j, k, length_idx;
+ sa_hdr_phys_t *hdr;
+ sa_idx_tab_t *idx_tab;
+ int attr_count;
+ int count;
+
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+ /* First make of copy of the old data */
+
+ if (((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_bonuslen) {
+ bonus_data_size = hdl->sa_bonus->db_size;
+ old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
+ bcopy(hdl->sa_bonus->db_data, old_data[0],
+ hdl->sa_bonus->db_size);
+ bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
+ } else {
+ old_data[0] = NULL;
+ }
+
+ /* Bring spill buffer online if it isn't currently */
+
+ if (sa_has_blkptr(hdl)) {
+ spill_data_size = hdl->sa_spill->db_size;
+ old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
+ bcopy(hdl->sa_spill->db_data, old_data[1],
+ hdl->sa_spill->db_size);
+ spill_attr_count =
+ hdl->sa_spill_tab->sa_layout->lot_attr_count;
+ } else {
+ old_data[1] = NULL;
+ }
+
+ /* build descriptor of all attributes */
+
+ attr_count = bonus_attr_count + spill_attr_count;
+ if (action == SA_ADD)
+ attr_count++;
+ else if (action == SA_REMOVE)
+ attr_count--;
+
+ attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
+
+ /*
+ * loop through bonus and spill buffer if it exists, and
+ * build up new attr_descriptor to reset the attributes
+ */
+ k = j = 0;
+ count = bonus_attr_count;
+ hdr = SA_GET_HDR(hdl, SA_BONUS);
+ idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
+ for (; k != 2; k++) {
+ /* iterate over each attribute in layout */
+ for (i = 0, length_idx = 0; i != count; i++) {
+ sa_attr_type_t attr;
+
+ attr = idx_tab->sa_layout->lot_attrs[i];
+ if (attr == newattr) {
+ if (action == SA_REMOVE) {
+ j++;
+ continue;
+ }
+ ASSERT(SA_REGISTERED_LEN(sa, attr) == 0);
+ ASSERT(action == SA_REPLACE);
+ SA_ADD_BULK_ATTR(attr_desc, j, attr,
+ locator, datastart, buflen);
+ } else {
+ length = SA_REGISTERED_LEN(sa, attr);
+ if (length == 0) {
+ length = hdr->sa_lengths[length_idx++];
+ }
+
+ SA_ADD_BULK_ATTR(attr_desc, j, attr,
+ NULL, (void *)
+ (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
+ (uintptr_t)old_data[k]), length);
+ }
+ }
+ if (k == 0 && hdl->sa_spill) {
+ hdr = SA_GET_HDR(hdl, SA_SPILL);
+ idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
+ count = spill_attr_count;
+ } else {
+ break;
+ }
+ }
+ if (action == SA_ADD) {
+ length = SA_REGISTERED_LEN(sa, newattr);
+ if (length == 0) {
+ length = buflen;
+ }
+ SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
+ datastart, buflen);
+ }
+
+ error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
+
+ if (old_data[0])
+ kmem_free(old_data[0], bonus_data_size);
+ if (old_data[1])
+ kmem_free(old_data[1], spill_data_size);
+ kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
+
+ return (error);
+}
+
+static int
+sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+ dmu_tx_t *tx)
+{
+ int error;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_object_type_t bonustype;
+
+ bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
+
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+ /* sync out registration table if necessary */
+ if (sa->sa_need_attr_registration)
+ sa_attr_register_sync(hdl, tx);
+
+ error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
+ if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
+ sa->sa_update_cb(hdl, tx);
+
+ return (error);
+}
+
+/*
+ * update or add new attribute
+ */
+int
+sa_update(sa_handle_t *hdl, sa_attr_type_t type,
+ void *buf, uint32_t buflen, dmu_tx_t *tx)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_attr = type;
+ bulk.sa_data_func = NULL;
+ bulk.sa_length = buflen;
+ bulk.sa_data = buf;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+int
+sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr,
+ uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_attr = attr;
+ bulk.sa_data = userdata;
+ bulk.sa_data_func = locator;
+ bulk.sa_length = buflen;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+/*
+ * Return size of an attribute
+ */
+
+int
+sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
+{
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_data = NULL;
+ bulk.sa_attr = attr;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ if (sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) {
+ mutex_exit(&hdl->sa_lock);
+ return (ENOENT);
+ }
+ *size = bulk.sa_size;
+
+ mutex_exit(&hdl->sa_lock);
+ return (0);
+}
+
+int
+sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ return (sa_lookup_impl(hdl, attrs, count));
+}
+
+int
+sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+ int error;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_lookup_locked(hdl, attrs, count);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+int
+sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
+{
+ int error;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, attrs, count, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+int
+sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL,
+ NULL, 0, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+void
+sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
+{
+ dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi);
+}
+
+void
+sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
+{
+ dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus,
+ blksize, nblocks);
+}
+
+void
+sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl)
+{
+ (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus,
+ oldhdl, newhdl, NULL, sa_evict);
+ oldhdl->sa_bonus = NULL;
+}
+
+void
+sa_set_userp(sa_handle_t *hdl, void *ptr)
+{
+ hdl->sa_userp = ptr;
+}
+
+dmu_buf_t *
+sa_get_db(sa_handle_t *hdl)
+{
+ return ((dmu_buf_t *)hdl->sa_bonus);
+}
+
+void *
+sa_get_userdata(sa_handle_t *hdl)
+{
+ return (hdl->sa_userp);
+}
+
+void
+sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
+{
+ ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
+ os->os_sa->sa_update_cb = func;
+}
+
+void
+sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
+{
+
+ mutex_enter(&os->os_sa->sa_lock);
+ sa_register_update_callback_locked(os, func);
+ mutex_exit(&os->os_sa->sa_lock);
+}
+
+uint64_t
+sa_handle_object(sa_handle_t *hdl)
+{
+ return (hdl->sa_bonus->db_object);
+}
+
+boolean_t
+sa_enabled(objset_t *os)
+{
+ return (os->os_sa == NULL);
+}
+
+int
+sa_set_sa_object(objset_t *os, uint64_t sa_object)
+{
+ sa_os_t *sa = os->os_sa;
+
+ if (sa->sa_master_obj)
+ return (1);
+
+ sa->sa_master_obj = sa_object;
+
+ return (0);
+}
+
+int
+sa_hdrsize(void *arg)
+{
+ sa_hdr_phys_t *hdr = arg;
+
+ return (SA_HDR_SIZE(hdr));
+}
+
+void
+sa_handle_lock(sa_handle_t *hdl)
+{
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+}
+
+void
+sa_handle_unlock(sa_handle_t *hdl)
+{
+ ASSERT(hdl);
+ mutex_exit(&hdl->sa_lock);
+}
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Tue Mar 16 09:43:38 2010 -0600
@@ -38,7 +38,6 @@
extern "C" {
#endif
-#define DB_BONUS_BLKID (-1ULL)
#define IN_DMU_SYNC 2
/*
@@ -242,6 +241,10 @@
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
void dbuf_create_bonus(struct dnode *dn);
+int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
+void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
+
+void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Tue Mar 16 09:43:38 2010 -0600
@@ -63,6 +63,7 @@
struct nvlist;
struct arc_buf;
struct zio_prop;
+struct sa_handle;
typedef struct objset objset_t;
typedef struct dmu_tx dmu_tx_t;
@@ -122,6 +123,10 @@
DMU_OT_USERREFS, /* ZAP */
DMU_OT_DDT_ZAP, /* ZAP */
DMU_OT_DDT_STATS, /* ZAP */
+ DMU_OT_SA, /* System attr */
+ DMU_OT_SA_MASTER_NODE, /* ZAP */
+ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
+ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
DMU_OT_NUMTYPES
} dmu_object_type_t;
@@ -159,6 +164,11 @@
#define DMU_DEADLIST_OBJECT (-3ULL)
/*
+ * artificial blkids for bonus buffer and spill blocks
+ */
+#define DMU_BONUS_BLKID (-1ULL)
+#define DMU_SPILL_BLKID (-2ULL)
+/*
* Public routines to create, destroy, open, and close objsets.
*/
int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
@@ -314,6 +324,7 @@
*/
#define WP_NOFILL 0x1
#define WP_DMU_SYNC 0x2
+#define WP_SPILL 0x4
void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
struct zio_prop *zp);
@@ -330,6 +341,17 @@
int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
int dmu_bonus_max(void);
int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
+int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
+
+/*
+ * Special spill buffer support used by "SA" framework
+ */
+
+int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
+ void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
/*
* Obtain the DMU buffer from the specified object which contains the
@@ -443,6 +465,9 @@
uint64_t len);
void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
+void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
void dmu_tx_abort(dmu_tx_t *tx);
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
void dmu_tx_wait(dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,6 +33,7 @@
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/zil.h>
+#include <sys/sa.h>
#ifdef __cplusplus
extern "C" {
@@ -99,6 +100,9 @@
/* stuff we store for the user */
kmutex_t os_user_ptr_lock;
void *os_user_ptr;
+
+ /* SA layout/attribute registration */
+ sa_os_t *os_sa;
};
#define DMU_META_OBJSET 0
@@ -146,7 +150,8 @@
int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
objset_t **osp);
void dmu_objset_evict(objset_t *os);
-void dmu_objset_do_userquota_callbacks(objset_t *os, dmu_tx_t *tx);
+void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
+void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before);
boolean_t dmu_objset_userused_enabled(objset_t *os);
int dmu_objset_userspace_upgrade(objset_t *os);
boolean_t dmu_objset_userspace_present(objset_t *os);
--- a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -77,6 +77,7 @@
THT_FREE,
THT_ZAP,
THT_SPACE,
+ THT_SPILL,
THT_NUMTYPES
};
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h Tue Mar 16 09:43:38 2010 -0600
@@ -63,6 +63,19 @@
#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
/*
+ * dnode id flags
+ *
+ * Note: a file will never ever have its
+ * ids moved from bonus->spill
+ * and only in a crypto environment would it be on spill
+ */
+#define DN_ID_CHKED_BONUS 0x1
+#define DN_ID_CHKED_SPILL 0x2
+#define DN_ID_OLD_EXIST 0x4
+#define DN_ID_NEW_EXIST 0x8
+#define DN_ID_SYNC 0x10
+
+/*
* Derived constants.
*/
#define DNODE_SIZE (1 << DNODE_SHIFT)
@@ -70,6 +83,7 @@
#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1)
+#define DN_KILL_SPILLBLK (1)
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
@@ -102,6 +116,9 @@
#define DNODE_FLAG_USED_BYTES (1<<0)
#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)
+/* Does dnode have a SA spill blkptr in bonus? */
+#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
+
typedef struct dnode_phys {
uint8_t dn_type; /* dmu_object_type_t */
uint8_t dn_indblkshift; /* ln2(indirect block size) */
@@ -122,7 +139,8 @@
uint64_t dn_pad3[4];
blkptr_t dn_blkptr[1];
- uint8_t dn_bonus[DN_MAX_BONUSLEN];
+ uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
+ blkptr_t dn_spill;
} dnode_phys_t;
typedef struct dnode {
@@ -162,6 +180,8 @@
uint8_t dn_next_nblkptr[TXG_SIZE];
uint8_t dn_next_nlevels[TXG_SIZE];
uint8_t dn_next_indblkshift[TXG_SIZE];
+ uint8_t dn_next_bonustype[TXG_SIZE];
+ uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */
uint16_t dn_next_bonuslen[TXG_SIZE];
uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
@@ -186,12 +206,17 @@
kmutex_t dn_dbufs_mtx;
list_t dn_dbufs; /* linked list of descendent dbuf_t's */
struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
+ boolean_t dn_have_spill; /* have spill or are spilling */
/* parent IO for current sync write */
zio_t *dn_zio;
/* used in syncing context */
- dnode_phys_t *dn_oldphys;
+ uint64_t dn_oldused; /* old phys used bytes */
+ uint64_t dn_oldflags; /* old phys dn_flags */
+ uint64_t dn_olduid, dn_oldgid;
+ uint64_t dn_newuid, dn_newgid;
+ int dn_id_flags;
/* holds prefetch structure */
struct zfetch dn_zfetch;
@@ -208,6 +233,9 @@
void dnode_special_close(dnode_t *dn);
void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
+void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
+void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
+
int dnode_hold(struct objset *dd, uint64_t object,
void *ref, dnode_t **dnp);
int dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/sa.h Tue Mar 16 09:43:38 2010 -0600
@@ -0,0 +1,171 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SA_H
+#define _SYS_SA_H
+
+#include <sys/dmu.h>
+
+/*
+ * Currently available byteswap functions.
+ * If it all possible new attributes should used
+ * one of the already defined byteswap functions.
+ * If a new byteswap function is added then the
+ * ZPL/Pool version will need to be bumped.
+ */
+
+typedef enum sa_bswap_type {
+ SA_UINT64_ARRAY,
+ SA_UINT32_ARRAY,
+ SA_UINT16_ARRAY,
+ SA_UINT8_ARRAY,
+ SA_ACL,
+} sa_bswap_type_t;
+
+typedef uint16_t sa_attr_type_t;
+
+/*
+ * Attribute to register support for.
+ */
+typedef struct sa_attr_reg {
+ char *sa_name; /* attribute name */
+ uint16_t sa_length;
+ sa_bswap_type_t sa_byteswap; /* bswap functon enum */
+ sa_attr_type_t sa_attr; /* filled in during registration */
+} sa_attr_reg_t;
+
+
+typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t,
+ boolean_t, void *userptr);
+
+/*
+ * array of attributes to store.
+ *
+ * This array should be treated as opaque/private data.
+ * The SA_BULK_ADD_ATTR() macro should be used for manipulating
+ * the array.
+ *
+ * When sa_replace_all_by_template() is used the attributes
+ * will be stored in the order defined in the array, except that
+ * the attributes may be split between the bonus and the spill buffer
+ *
+ */
+typedef struct sa_bulk_attr {
+ void *sa_data;
+ sa_data_locator_t *sa_data_func;
+ uint16_t sa_length;
+ sa_attr_type_t sa_attr;
+ /* the following are private to the sa framework */
+ void *sa_addr;
+ uint16_t sa_buftype;
+ uint16_t sa_size;
+} sa_bulk_attr_t;
+
+
+/*
+ * special macro for adding entries for bulk attr support
+ * bulk - sa_bulk_attr_t
+ * count - integer that will be incremented during each add
+ * attr - attribute to manipulate
+ * func - function for accessing data.
+ * data - pointer to data.
+ * len - length of data
+ */
+
+#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \
+{ \
+ b[idx].sa_attr = attr;\
+ b[idx].sa_data_func = func; \
+ b[idx].sa_data = data; \
+ b[idx++].sa_length = len; \
+}
+
+typedef struct sa_os sa_os_t;
+
+typedef enum sa_handle_type {
+ SA_HDL_SHARED,
+ SA_HDL_PRIVATE
+} sa_handle_type_t;
+
+struct sa_handle;
+typedef void *sa_lookup_tab_t;
+typedef struct sa_handle sa_handle_t;
+
+typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx);
+
+int sa_handle_get(objset_t *, uint64_t, void *userp,
+ sa_handle_type_t, sa_handle_t **);
+int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp,
+ sa_handle_type_t, sa_handle_t **);
+void sa_handle_destroy(sa_handle_t *);
+int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **);
+void sa_buf_rele(dmu_buf_t *, void *);
+int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen);
+int sa_update(sa_handle_t *, sa_attr_type_t, void *buf,
+ uint32_t buflen, dmu_tx_t *);
+int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *);
+int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *);
+int sa_size(sa_handle_t *, sa_attr_type_t, int *);
+int sa_update_from_cb(sa_handle_t *, sa_attr_type_t,
+ uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
+void sa_object_info(sa_handle_t *, dmu_object_info_t *);
+void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
+void sa_update_user(sa_handle_t *, sa_handle_t *);
+void *sa_get_userdata(sa_handle_t *);
+void sa_set_userp(sa_handle_t *, void *);
+dmu_buf_t *sa_get_db(sa_handle_t *);
+uint64_t sa_handle_object(sa_handle_t *);
+boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size);
+void sa_register_update_callback(objset_t *, sa_update_cb_t *);
+sa_attr_type_t *sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int);
+void sa_tear_down(objset_t *);
+int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *,
+ int, dmu_tx_t *);
+int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *,
+ int, dmu_tx_t *);
+boolean_t sa_enabled(objset_t *);
+void sa_cache_init();
+void sa_cache_fini();
+int sa_set_sa_object(objset_t *, uint64_t);
+int sa_hdrsize(void *);
+void sa_handle_lock(sa_handle_t *);
+void sa_handle_unlock(sa_handle_t *);
+
+#ifdef _KERNEL
+int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *);
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SA_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/sa_impl.h Tue Mar 16 09:43:38 2010 -0600
@@ -0,0 +1,288 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SA_IMPL_H
+#define _SYS_SA_IMPL_H
+
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/list.h>
+
+/*
+ * Array of known attributes and their
+ * various characteristics.
+ */
+typedef struct sa_attr_table {
+ sa_attr_type_t sa_attr;
+ uint8_t sa_registered;
+ uint16_t sa_length;
+ sa_bswap_type_t sa_byteswap;
+ char *sa_name;
+} sa_attr_table_t;
+
+/*
+ * Zap attribute format for attribute registration
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | unused | len | bswap | attr num |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Zap attribute format for layout information.
+ *
+ * layout information is stored as an array of attribute numbers
+ * The name of the attribute is the layout number (0, 1, 2, ...)
+ *
+ * 16 0
+ * +---- ---+
+ * | attr # |
+ * +--------+
+ * | attr # |
+ * +--- ----+
+ * ......
+ *
+ */
+
+#define ATTR_BSWAP(x) BF32_GET(x, 16, 8)
+#define ATTR_LENGTH(x) BF32_GET(x, 24, 16)
+#define ATTR_NUM(x) BF32_GET(x, 0, 16)
+#define ATTR_ENCODE(x, attr, length, bswap) \
+{ \
+ BF64_SET(x, 24, 16, length); \
+ BF64_SET(x, 16, 8, bswap); \
+ BF64_SET(x, 0, 16, attr); \
+}
+
+#define TOC_OFF(x) BF32_GET(x, 0, 23)
+#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1)
+#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4)
+#define TOC_ATTR_ENCODE(x, len_idx, offset) \
+{ \
+ BF32_SET(x, 31, 1, 1); \
+ BF32_SET(x, 24, 7, len_idx); \
+ BF32_SET(x, 0, 24, offset); \
+}
+
+#define SA_LAYOUTS "LAYOUTS"
+#define SA_REGISTRY "REGISTRY"
+
+/*
+ * Each unique layout will have their own table
+ * sa_lot (layout_table)
+ */
+typedef struct sa_lot {
+ avl_node_t lot_num_node;
+ avl_node_t lot_hash_node;
+ uint64_t lot_num;
+ uint64_t lot_hash;
+ sa_attr_type_t *lot_attrs; /* array of attr #'s */
+ uint32_t lot_var_sizes; /* how many aren't fixed size */
+ uint32_t lot_attr_count; /* total attr count */
+ list_t lot_idx_tab; /* should be only a couple of entries */
+ int lot_instance; /* used with lot_hash to identify entry */
+} sa_lot_t;
+
+/* index table of offsets */
+typedef struct sa_idx_tab {
+ list_node_t sa_next;
+ sa_lot_t *sa_layout;
+ uint16_t *sa_variable_lengths;
+ refcount_t sa_refcount;
+ uint32_t *sa_idx_tab; /* array of offsets */
+} sa_idx_tab_t;
+
+/*
+ * Since the offset/index information into the actual data
+ * will usually be identical we can share that information with
+ * all handles that have the exact same offsets.
+ *
+ * You would typically only have a large number of different table of
+ * contents if you had a several variable sized attributes.
+ *
+ * Two AVL trees are used to track the attribute layout numbers.
+ * one is keyed by number and will be consulted when a DMU_OT_SA
+ * object is first read. The second tree is keyed by the hash signature
+ * of the attributes and will be consulted when an attribute is added
+ * to determine if we already have an instance of that layout. Both
+ * of these tree's are interconnected. The only difference is that
+ * when an entry is found in the "hash" tree the list of attributes will
+ * need to be compared against the list of attributes you have in hand.
+ * The assumption is that typically attributes will just be updated and
+ * adding a completely new attribute is a very rare operation.
+ */
+struct sa_os {
+ kmutex_t sa_lock;
+ boolean_t sa_need_attr_registration;
+ boolean_t sa_force_spill;
+ uint64_t sa_master_obj;
+ uint64_t sa_reg_attr_obj;
+ uint64_t sa_layout_attr_obj;
+ int sa_num_attrs;
+ sa_attr_table_t *sa_attr_table; /* private attr table */
+ sa_update_cb_t *sa_update_cb;
+ avl_tree_t sa_layout_num_tree; /* keyed by layout number */
+ avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */
+ int sa_user_table_sz;
+ sa_attr_type_t *sa_user_table; /* user name->attr mapping table */
+};
+
+/*
+ * header for all bonus and spill buffers.
+ * The header has a fixed portion with a variable number
+ * of "lengths" depending on the number of variable sized
+ * attribues which are determined by the "layout number"
+ */
+
+#define SA_MAGIC 0x2F505A /* ZFS SA */
+typedef struct sa_hdr_phys {
+ uint32_t sa_magic;
+ uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */
+ uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
+ /* ... Data follows the lengths. */
+} sa_hdr_phys_t;
+
+/*
+ * sa_hdr_phys -> sa_layout_info
+ *
+ * 16 10 0
+ * +--------+-------+
+ * | hdrsz |layout |
+ * +--------+-------+
+ *
+ * Bits 0-10 are the layout number
+ * Bits 11-16 are the size of the header.
+ * The hdrsize is the number * 8
+ *
+ * For example.
+ * hdrsz of 1 ==> 8 byte header
+ * 2 ==> 16 byte header
+ *
+ */
+
+#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
+#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0)
+#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
+{ \
+ BF32_SET_SB(x, 10, 6, 3, 0, size); \
+ BF32_SET(x, 0, 10, num); \
+}
+
+typedef enum sa_buf_type {
+ SA_BONUS = 1,
+ SA_SPILL = 2
+} sa_buf_type_t;
+
+typedef enum sa_data_op {
+ SA_LOOKUP,
+ SA_UPDATE,
+ SA_ADD,
+ SA_REPLACE,
+ SA_REMOVE
+} sa_data_op_t;
+
+/*
+ * Opaque handle used for most sa functions
+ *
+ * This needs to be kept as small as possible.
+ */
+
+struct sa_handle {
+ kmutex_t sa_lock;
+ dmu_buf_t *sa_bonus;
+ dmu_buf_t *sa_spill;
+ objset_t *sa_os;
+ void *sa_userp;
+ sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */
+ sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */
+};
+
+#define SA_GET_DB(hdl, type) \
+ (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill)
+
+#define SA_GET_HDR(hdl, type) \
+ ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \
+ type))->db.db_data))
+
+#define SA_IDX_TAB_GET(hdl, type) \
+ (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab)
+
+#define IS_SA_BONUSTYPE(a) \
+ ((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
+
+#define SA_BONUSTYPE_FROM_DB(db) \
+ (((dmu_buf_impl_t *)db)->db_dnode->dn_bonustype)
+
+#define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t))
+
+#define SA_LAYOUT_NUM(x, type) \
+ ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \
+ ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x))))
+
+
+#define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length
+
+#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\
+ hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \
+ SA_REGISTERED_LEN(sa, attr))
+
+#define SA_SET_HDR(hdr, num, size) \
+ { \
+ hdr->sa_magic = SA_MAGIC; \
+ SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \
+ }
+
+#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \
+ { \
+ bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \
+ bulk.sa_buftype = type; \
+ bulk.sa_addr = \
+ (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \
+ (uintptr_t)hdr); \
+}
+
+#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \
+ (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \
+ (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \
+ sizeof (uint16_t), 8) : 0)))
+
+int sa_add_impl(sa_handle_t *, sa_attr_type_t,
+ uint32_t, sa_data_locator_t, void *, dmu_tx_t *);
+
+void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *);
+int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *);
+
+void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t,
+ uint16_t *, sa_hdr_phys_t *);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SA_IMPL_H */
--- a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,6 +33,7 @@
#include <sys/acl.h>
#include <sys/dmu.h>
#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
#ifdef __cplusplus
extern "C" {
@@ -106,12 +107,18 @@
#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT)
+/*
+ * Size of ACL count is always 2 bytes.
+ * Necessary to for dealing with both V0 ACL and V1 ACL layout
+ */
+#define ZFS_ACL_COUNT_SIZE (sizeof (uint16_t))
+
typedef struct zfs_acl_phys {
uint64_t z_acl_extern_obj; /* ext acl pieces */
uint32_t z_acl_size; /* Number of bytes in ACL */
uint16_t z_acl_version; /* acl version */
uint16_t z_acl_count; /* ace count */
- uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
+ uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
} zfs_acl_phys_t;
typedef struct acl_ops {
@@ -146,21 +153,26 @@
void *z_allocdata; /* pointer to kmem allocated memory */
size_t z_allocsize; /* Size of blob in bytes */
size_t z_size; /* length of ACL data */
- int z_ace_count; /* number of ACEs in this acl node */
+ uint64_t z_ace_count; /* number of ACEs in this acl node */
int z_ace_idx; /* ace iterator positioned on */
} zfs_acl_node_t;
typedef struct zfs_acl {
- int z_acl_count; /* Number of ACEs */
+ uint64_t z_acl_count; /* Number of ACEs */
size_t z_acl_bytes; /* Number of bytes in ACL */
uint_t z_version; /* version of ACL */
void *z_next_ace; /* pointer to next ACE */
- int z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */
+ uint64_t z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */
zfs_acl_node_t *z_curr_node; /* current node iterator is handling */
list_t z_acl; /* chunks of ACE data */
acl_ops_t z_ops; /* ACL operations */
} zfs_acl_t;
+typedef struct acl_locator_cb {
+ zfs_acl_t *cb_aclp;
+ zfs_acl_node_t *cb_acl_node;
+} zfs_acl_locator_cb_t;
+
#define ACL_DATA_ALLOCED 0x1
#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
@@ -174,6 +186,10 @@
struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */
} zfs_acl_ids_t;
+#define ZFS_EXTERNAL_ACL(zp) \
+ (zp->z_is_sa ? 0 : zfs_external_acl(zp))
+#define ZNODE_ACL_VERSION(zp) \
+ (zp->z_is_sa ? ZFS_ACL_VERSION_FUID : zfs_znode_acl_version(zp))
/*
* Property values for acl_mode and acl_inherit.
*
@@ -215,6 +231,14 @@
int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *,
struct zfs_fuid_info **, zfs_acl_t **);
int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *);
+uint64_t zfs_external_acl(struct znode *);
+int zfs_znode_acl_version(struct znode *);
+int zfs_acl_size(struct znode *, int *);
+zfs_acl_t *zfs_acl_alloc(int);
+zfs_acl_node_t *zfs_acl_node_alloc(size_t);
+void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *);
+void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *, uint64_t *);
#endif
--- a/usr/src/uts/common/fs/zfs/sys/zfs_dir.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_dir.h Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -57,7 +57,7 @@
extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
pathname_t *);
extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
- uint_t, znode_t **, int, zfs_acl_ids_t *);
+ uint_t, znode_t **, zfs_acl_ids_t *);
extern void zfs_rmnode(znode_t *);
extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
extern boolean_t zfs_dirempty(znode_t *);
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h Tue Mar 16 09:43:38 2010 -0600
@@ -71,12 +71,13 @@
#define DMU_BACKUP_FEATURE_DEDUP (0x1)
#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2)
+#define DMU_BACKUP_FEATURE_SA_SPILL (0x4)
/*
* Mask of all supported backup features
*/
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
- DMU_BACKUP_FEATURE_DEDUPPROPS)
+ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
@@ -118,7 +119,7 @@
enum {
DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
- DRR_NUMTYPES
+ DRR_SPILL, DRR_NUMTYPES
} drr_type;
uint32_t drr_payloadlen;
union {
@@ -188,6 +189,13 @@
uint8_t drr_pad2[6];
ddt_key_t drr_key; /* deduplication key */
} drr_write_byref;
+ struct drr_spill {
+ uint64_t drr_object;
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ uint64_t drr_pad[4]; /* needed for crypto */
+ /* spill data follows */
+ } drr_spill;
} drr_u;
} dmu_replay_record_t;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_sa.h Tue Mar 16 09:43:38 2010 -0600
@@ -0,0 +1,143 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_SA_H
+#define _SYS_ZFS_SA_H
+
+#ifdef _KERNEL
+#include <sys/types32.h>
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zil.h>
+
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the list of known attributes
+ * to the ZPL. The values of the actual
+ * attributes are not defined by the order
+ * the enums. It is controlled by the attribute
+ * registration mechanism. Two different file system
+ * could have different numeric values for the same
+ * attributes. this list is only used for dereferencing
+ * into the table that will hold the actual numeric value.
+ */
+typedef enum zpl_attr {
+ ZPL_ATIME,
+ ZPL_MTIME,
+ ZPL_CTIME,
+ ZPL_CRTIME,
+ ZPL_GEN,
+ ZPL_MODE,
+ ZPL_SIZE,
+ ZPL_PARENT,
+ ZPL_LINKS,
+ ZPL_XATTR,
+ ZPL_RDEV,
+ ZPL_FLAGS,
+ ZPL_UID,
+ ZPL_GID,
+ ZPL_PAD,
+ ZPL_ZNODE_ACL,
+ ZPL_DACL_COUNT,
+ ZPL_SYMLINK,
+ ZPL_SCANSTAMP,
+ ZPL_DACL_ACES,
+ ZPL_END
+} zpl_attr_t;
+
+#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108
+#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \
+ sizeof (zfs_acl_phys_t))
+
+#define SA_MODE_OFFSET 0
+#define SA_SIZE_OFFSET 8
+#define SA_GEN_OFFSET 16
+#define SA_UID_OFFSET 24
+#define SA_GID_OFFSET 32
+#define SA_PARENT_OFFSET 40
+
+extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1];
+extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1];
+
+/*
+ * This is a deprecated data structure that only exists for
+ * dealing with file systems create prior to ZPL version 5.
+ */
+typedef struct znode_phys {
+ uint64_t zp_atime[2]; /* 0 - last file access time */
+ uint64_t zp_mtime[2]; /* 16 - last file modification time */
+ uint64_t zp_ctime[2]; /* 32 - last file change time */
+ uint64_t zp_crtime[2]; /* 48 - creation time */
+ uint64_t zp_gen; /* 64 - generation (txg of creation) */
+ uint64_t zp_mode; /* 72 - file mode bits */
+ uint64_t zp_size; /* 80 - size of file */
+ uint64_t zp_parent; /* 88 - directory parent (`..') */
+ uint64_t zp_links; /* 96 - number of links to file */
+ uint64_t zp_xattr; /* 104 - DMU object for xattrs */
+ uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
+ uint64_t zp_flags; /* 120 - persistent flags */
+ uint64_t zp_uid; /* 128 - file owner */
+ uint64_t zp_gid; /* 136 - owning group */
+ uint64_t zp_zap; /* 144 - extra attributes */
+ uint64_t zp_pad[3]; /* 152 - future */
+ zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */
+ /*
+ * Data may pad out any remaining bytes in the znode buffer, eg:
+ *
+ * |<---------------------- dnode_phys (512) ------------------------>|
+ * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+ * |<---- znode (264) ---->|<---- data (56) ---->|
+ *
+ * At present, we use this space for the following:
+ * - symbolic links
+ * - 32-byte anti-virus scanstamp (regular files only)
+ */
+} znode_phys_t;
+
+#ifdef _KERNEL
+int zfs_sa_readlink(struct znode *, uio_t *);
+void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
+void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *);
+void zfs_sa_get_scanstamp(struct znode *, xvattr_t *);
+void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *);
+void zfs_sa_uprade_pre(struct sa_handle *, void *, dmu_tx_t *);
+void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *);
+void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_SA_H */
--- a/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -31,6 +31,7 @@
#include <sys/list.h>
#include <sys/vfs.h>
#include <sys/zil.h>
+#include <sys/sa.h>
#include <sys/rrwlock.h>
#include <sys/zfs_ioctl.h>
@@ -39,6 +40,7 @@
#endif
typedef struct zfsvfs zfsvfs_t;
+struct znode;
struct zfsvfs {
vfs_t *z_vfs; /* generic fs struct */
@@ -73,11 +75,13 @@
boolean_t z_vscan; /* virus scan on/off */
boolean_t z_use_fuids; /* version allows fuids */
boolean_t z_replay; /* set during ZIL replay */
+ boolean_t z_use_sa; /* version allow system attributes */
uint64_t z_version; /* ZPL version */
uint64_t z_shares_dir; /* hidden shares dir */
kmutex_t z_lock;
uint64_t z_userquota_obj;
uint64_t z_groupquota_obj;
+ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */
#define ZFS_OBJ_MTX_SZ 64
kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
};
@@ -140,8 +144,10 @@
uint64_t *cookiep, void *vbuf, uint64_t *bufsizep);
extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
const char *domain, uint64_t rid, uint64_t quota);
-extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs,
- boolean_t isgroup, uint64_t fuid);
+extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *,
+ boolean_t isgroup);
+extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup,
+ uint64_t fuid);
extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers);
extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp);
extern void zfsvfs_free(zfsvfs_t *zfsvfs);
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -32,8 +32,10 @@
#include <sys/attr.h>
#include <sys/list.h>
#include <sys/dmu.h>
+#include <sys/sa.h>
#include <sys/zfs_vfsops.h>
#include <sys/rrwlock.h>
+#include <sys/zfs_sa.h>
#endif
#include <sys/zfs_acl.h>
#include <sys/zil.h>
@@ -59,12 +61,14 @@
#define ZFS_AV_MODIFIED 0x0000040000000000
#define ZFS_REPARSE 0x0000080000000000
-#define ZFS_ATTR_SET(zp, attr, value) \
+#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \
{ \
if (value) \
- zp->z_phys->zp_flags |= attr; \
+ pflags |= attr; \
else \
- zp->z_phys->zp_flags &= ~attr; \
+ pflags &= ~attr; \
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \
+ &pflags, sizeof (pflags), tx)); \
}
/*
@@ -80,6 +84,27 @@
#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */
#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */
+#define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME]
+#define SA_ZPL_MTIME(z) z->z_attr_table[ZPL_MTIME]
+#define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME]
+#define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME]
+#define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN]
+#define SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES]
+#define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR]
+#define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK]
+#define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV]
+#define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP]
+#define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID]
+#define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID]
+#define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT]
+#define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS]
+#define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE]
+#define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT]
+#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS]
+#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE]
+#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL]
+#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD]
+
/*
* Is ID ephemeral?
*/
@@ -88,8 +113,10 @@
/*
* Should we use FUIDs?
*/
-#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID &&\
+#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \
spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+#define USE_SA(version, os) (version >= ZPL_VERSION_SA && \
+ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA)
#define MASTER_NODE_OBJ 1
@@ -104,6 +131,7 @@
#define ZPL_VERSION_STR "VERSION"
#define ZFS_FUID_TABLES "FUID"
#define ZFS_SHARES_DIR "SHARES"
+#define ZFS_SA_ATTRS "SA_ATTRS"
#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
@@ -132,42 +160,6 @@
#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
/*
- * This is the persistent portion of the znode. It is stored
- * in the "bonus buffer" of the file. Short symbolic links
- * are also stored in the bonus buffer.
- */
-typedef struct znode_phys {
- uint64_t zp_atime[2]; /* 0 - last file access time */
- uint64_t zp_mtime[2]; /* 16 - last file modification time */
- uint64_t zp_ctime[2]; /* 32 - last file change time */
- uint64_t zp_crtime[2]; /* 48 - creation time */
- uint64_t zp_gen; /* 64 - generation (txg of creation) */
- uint64_t zp_mode; /* 72 - file mode bits */
- uint64_t zp_size; /* 80 - size of file */
- uint64_t zp_parent; /* 88 - directory parent (`..') */
- uint64_t zp_links; /* 96 - number of links to file */
- uint64_t zp_xattr; /* 104 - DMU object for xattrs */
- uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
- uint64_t zp_flags; /* 120 - persistent flags */
- uint64_t zp_uid; /* 128 - file owner */
- uint64_t zp_gid; /* 136 - owning group */
- uint64_t zp_zap; /* 144 - extra attributes */
- uint64_t zp_pad[3]; /* 152 - future */
- zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */
- /*
- * Data may pad out any remaining bytes in the znode buffer, eg:
- *
- * |<---------------------- dnode_phys (512) ------------------------>|
- * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
- * |<---- znode (264) ---->|<---- data (56) ---->|
- *
- * At present, we use this space for the following:
- * - symbolic links
- * - 32-byte anti-virus scanstamp (regular files only)
- */
-} znode_phys_t;
-
-/*
* Directory entry locks control access to directory entries.
* They are used to protect creates, deletes, and renames.
* Each directory znode has a mutex and a list of locked names.
@@ -200,16 +192,20 @@
uint_t z_seq; /* modification sequence number */
uint64_t z_mapcnt; /* number of pages mapped to file */
uint64_t z_last_itx; /* last ZIL itx on this znode */
- uint64_t z_gen; /* generation (same as zp_gen) */
+ uint64_t z_gen; /* generation (cached) */
+ uint64_t z_size; /* file size (cached) */
+ uint64_t z_atime[2]; /* atime (cached) */
+ uint64_t z_links; /* file links (cached) */
+ uint64_t z_pflags; /* pflags (cached) */
+ uid_t z_uid; /* uid mapped (cached) */
+ uid_t z_gid; /* gid mapped (cached) */
+ mode_t z_mode; /* mode (cached) */
uint32_t z_sync_cnt; /* synchronous open count */
kmutex_t z_acl_lock; /* acl data lock */
zfs_acl_t *z_acl_cached; /* cached acl */
list_node_t z_link_node; /* all znodes in fs link */
- /*
- * These are dmu managed fields.
- */
- znode_phys_t *z_phys; /* pointer to persistent znode */
- dmu_buf_t *z_dbuf; /* buffer containing the z_phys */
+ sa_handle_t *z_sa_hdl; /* handle to sa data */
+ boolean_t z_is_sa; /* are we native sa? */
} znode_t;
@@ -252,7 +248,7 @@
#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
#define ZFS_VERIFY_ZP(zp) \
- if ((zp)->z_dbuf == NULL) { \
+ if ((zp)->z_sa_hdl == NULL) { \
ZFS_EXIT((zp)->z_zfsvfs); \
return (EIO); \
} \
@@ -294,14 +290,14 @@
#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
- zfs_time_stamper(zp, ACCESSED, NULL)
+ zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE);
extern int zfs_init_fs(zfsvfs_t *, znode_t **);
extern void zfs_set_dataprop(objset_t *);
extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *,
dmu_tx_t *tx);
-extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *);
-extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *);
+extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2],
+ uint64_t [2], boolean_t);
extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
extern void zfs_znode_init(void);
@@ -340,7 +336,7 @@
znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp);
extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
-extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap);
+extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx);
extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/zfs_acl.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_acl.c Tue Mar 16 09:43:38 2010 -0600
@@ -50,6 +50,7 @@
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zap.h>
+#include <sys/sa.h>
#include "fs/fs_subr.h"
#include <acl/acl_common.h>
@@ -321,6 +322,82 @@
zfs_ace_fuid_data
};
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL version in order to determine if the file use to have
+ * an external ACL and what version of ACL previously existed on the
+ * file. Would really be nice to not need this, sigh.
+ */
+
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+
+ if (zp->z_is_sa)
+ return (0);
+
+ VERIFY(0 == sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys)));
+
+ return (acl_phys.z_acl_extern_obj);
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+ zfs_acl_phys_t *aclphys)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t acl_count;
+ int size;
+ int error;
+
+ if (zp->z_is_sa) {
+ if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+ &size)) != 0)
+ return (error);
+ *aclsize = size;
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+ &acl_count, sizeof (acl_count))) != 0)
+ return (error);
+ *aclcount = acl_count;
+ } else {
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ aclphys, sizeof (*aclphys))) != 0)
+ return (error);
+
+ if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+ *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+ *aclcount = aclphys->z_acl_size;
+ } else {
+ *aclsize = aclphys->z_acl_size;
+ *aclcount = aclphys->z_acl_count;
+ }
+ }
+ return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+
+ if (zp->z_is_sa) {
+ return (ZFS_ACL_VERSION_FUID);
+ } else {
+ VERIFY(0 == sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys)));
+ return (acl_phys.z_acl_version);
+ }
+}
+
static int
zfs_acl_version(int version)
{
@@ -336,7 +413,7 @@
return (zfs_acl_version(zp->z_zfsvfs->z_version));
}
-static zfs_acl_t *
+zfs_acl_t *
zfs_acl_alloc(int vers)
{
zfs_acl_t *aclp;
@@ -352,7 +429,7 @@
return (aclp);
}
-static zfs_acl_node_t *
+zfs_acl_node_t *
zfs_acl_node_alloc(size_t bytes)
{
zfs_acl_node_t *aclnode;
@@ -463,6 +540,8 @@
{
zfs_acl_node_t *aclnode;
+ ASSERT(aclp);
+
if (start == NULL) {
aclnode = list_head(&aclp->z_acl);
if (aclnode == NULL)
@@ -509,6 +588,7 @@
*who = aclp->z_ops.ace_who_get(acep);
aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
aclnode->z_ace_idx++;
+
return ((void *)acep);
}
return (NULL);
@@ -542,7 +622,7 @@
*/
int
zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
- void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size,
+ void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
zfs_fuid_info_t **fuidp, cred_t *cr)
{
int i;
@@ -773,8 +853,8 @@
* Determine mode of file based on ACL.
* Also, create FUIDs for any User/Group ACEs
*/
-static uint64_t
-zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, uint64_t *pflags)
{
int entry_type;
mode_t mode;
@@ -785,7 +865,7 @@
uint32_t access_mask;
boolean_t an_exec_denied = B_FALSE;
- mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+ mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
while (acep = zfs_acl_next_ace(aclp, acep, &who,
&access_mask, &iflags, &type)) {
@@ -930,48 +1010,13 @@
an_exec_denied = B_TRUE;
if (an_exec_denied)
- zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED;
+ *pflags &= ~ZFS_NO_EXECS_DENIED;
else
- zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED;
+ *pflags |= ZFS_NO_EXECS_DENIED;
return (mode);
}
-static zfs_acl_t *
-zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify)
-{
- zfs_acl_t *aclp;
- zfs_acl_node_t *aclnode;
-
- aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version);
-
- /*
- * Version 0 to 1 znode_acl_phys has the size/count fields swapped.
- * Version 0 didn't have a size field, only a count.
- */
- if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) {
- aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_size;
- aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count);
- } else {
- aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
- aclp->z_acl_bytes = zp->z_phys->zp_acl.z_acl_size;
- }
-
- aclnode = zfs_acl_node_alloc(will_modify ? aclp->z_acl_bytes : 0);
- aclnode->z_ace_count = aclp->z_acl_count;
- if (will_modify) {
- bcopy(zp->z_phys->zp_acl.z_ace_data, aclnode->z_acldata,
- aclp->z_acl_bytes);
- } else {
- aclnode->z_size = aclp->z_acl_bytes;
- aclnode->z_acldata = &zp->z_phys->zp_acl.z_ace_data[0];
- }
-
- list_insert_head(&aclp->z_acl, aclnode);
-
- return (aclp);
-}
-
/*
* Read an external acl object. If the intent is to modify, always
* create a new acl and leave any cached acl in place.
@@ -979,12 +1024,13 @@
static int
zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
{
- uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
zfs_acl_t *aclp;
- size_t aclsize;
- size_t acl_count;
+ int aclsize;
+ int acl_count;
zfs_acl_node_t *aclnode;
- int error;
+ zfs_acl_phys_t znode_acl;
+ int version;
+ int error;
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
@@ -993,48 +1039,69 @@
return (0);
}
- if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
- *aclpp = zfs_acl_node_read_internal(zp, will_modify);
- if (!will_modify)
- zp->z_acl_cached = *aclpp;
- return (0);
- }
-
- aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version);
- if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) {
- zfs_acl_phys_v0_t *zacl0 =
- (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl;
-
- aclsize = ZFS_ACL_SIZE(zacl0->z_acl_count);
- acl_count = zacl0->z_acl_count;
- } else {
- aclsize = zp->z_phys->zp_acl.z_acl_size;
- acl_count = zp->z_phys->zp_acl.z_acl_count;
- if (aclsize == 0)
- aclsize = acl_count * sizeof (zfs_ace_t);
- }
- aclnode = zfs_acl_node_alloc(aclsize);
- list_insert_head(&aclp->z_acl, aclnode);
- error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
- aclsize, aclnode->z_acldata, DMU_READ_PREFETCH);
- aclnode->z_ace_count = acl_count;
+ version = ZNODE_ACL_VERSION(zp);
+
+ if ((error = zfs_acl_znode_info(zp, &aclsize,
+ &acl_count, &znode_acl)) != 0)
+ return (error);
+
+ aclp = zfs_acl_alloc(version);
+
aclp->z_acl_count = acl_count;
aclp->z_acl_bytes = aclsize;
+ aclnode = zfs_acl_node_alloc(aclsize);
+ aclnode->z_ace_count = aclp->z_acl_count;
+ aclnode->z_size = aclsize;
+
+ if (!zp->z_is_sa) {
+ if (znode_acl.z_acl_extern_obj) {
+ error = dmu_read(zp->z_zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+ aclnode->z_acldata, DMU_READ_PREFETCH);
+ } else {
+ bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+ aclnode->z_size);
+ }
+ } else {
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs),
+ aclnode->z_acldata, aclnode->z_size);
+ }
+
if (error != 0) {
zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
/* convert checksum errors into IO errors */
if (error == ECKSUM)
error = EIO;
return (error);
}
+ list_insert_head(&aclp->z_acl, aclnode);
+
*aclpp = aclp;
if (!will_modify)
zp->z_acl_cached = aclp;
return (0);
}
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+ boolean_t start, void *userdata)
+{
+ zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+ if (start) {
+ cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+ } else {
+ cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+ cb->cb_acl_node);
+ }
+ *dataptr = cb->cb_acl_node->z_acldata;
+ *length = cb->cb_acl_node->z_size;
+}
+
/*
* common code for setting ACLs.
*
@@ -1045,28 +1112,33 @@
int
zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
{
- int error;
- znode_phys_t *zphys = zp->z_phys;
- zfs_acl_phys_t *zacl = &zphys->zp_acl;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t aoid = zphys->zp_acl.z_acl_extern_obj;
- uint64_t off = 0;
- dmu_object_type_t otype;
- zfs_acl_node_t *aclnode;
-
- dmu_buf_will_dirty(zp->z_dbuf, tx);
+ int error;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_object_type_t otype;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t mode;
+ sa_bulk_attr_t bulk[5];
+ uint64_t ctime[2];
+ int count = 0;
+
+ mode = zp->z_mode;
+ mode = zfs_mode_compute(mode, aclp, &zp->z_pflags);
+
+ zp->z_mode = mode;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
if (zp->z_acl_cached) {
zfs_acl_free(zp->z_acl_cached);
zp->z_acl_cached = NULL;
}
- zphys->zp_mode = zfs_mode_compute(zp, aclp);
-
/*
- * Decide which object type to use. If we are forced to
- * use old ACL format then transform ACL into zfs_oldace_t
- * layout.
+ * Upgrade needed?
*/
if (!zfsvfs->z_use_fuids) {
otype = DMU_OT_OLDACL;
@@ -1078,84 +1150,113 @@
otype = DMU_OT_ACL;
}
- if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- /*
- * If ACL was previously external and we are now
- * converting to new ACL format then release old
- * ACL object and create a new one.
- */
- if (aoid && aclp->z_version != zacl->z_acl_version) {
- error = dmu_object_free(zfsvfs->z_os,
- zp->z_phys->zp_acl.z_acl_extern_obj, tx);
- if (error)
- return (error);
- aoid = 0;
- }
- if (aoid == 0) {
- aoid = dmu_object_alloc(zfsvfs->z_os,
- otype, aclp->z_acl_bytes,
- otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE,
- otype == DMU_OT_ACL ? DN_MAX_BONUSLEN : 0, tx);
+ /*
+ * Arrgh, we have to handle old on disk format
+ * as well as newer (preferred) SA format.
+ */
+
+ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+ locate.cb_aclp = aclp;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+ NULL, &aclp->z_acl_count, sizeof (uint64_t));
+ } else { /* Painful legacy way */
+ zfs_acl_node_t *aclnode;
+ uint64_t off = 0;
+ zfs_acl_phys_t acl_phys;
+ uint64_t aoid;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ &acl_phys, sizeof (acl_phys))) != 0)
+ return (error);
+
+ aoid = acl_phys.z_acl_extern_obj;
+
+ if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ /*
+ * If ACL was previously external and we are now
+ * converting to new ACL format then release old
+ * ACL object and create a new one.
+ */
+ if (aoid &&
+ aclp->z_version != acl_phys.z_acl_version) {
+ error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+ if (error)
+ return (error);
+ aoid = 0;
+ }
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ otype, aclp->z_acl_bytes,
+ otype == DMU_OT_ACL ?
+ DMU_OT_SYSACL : DMU_OT_NONE,
+ otype == DMU_OT_ACL ?
+ DN_MAX_BONUSLEN : 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os,
+ aoid, aclp->z_acl_bytes, 0, tx);
+ }
+ acl_phys.z_acl_extern_obj = aoid;
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ dmu_write(zfsvfs->z_os, aoid, off,
+ aclnode->z_size, aclnode->z_acldata, tx);
+ off += aclnode->z_size;
+ }
} else {
- (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
- aclp->z_acl_bytes, 0, tx);
- }
- zphys->zp_acl.z_acl_extern_obj = aoid;
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- if (aclnode->z_ace_count == 0)
- continue;
- dmu_write(zfsvfs->z_os, aoid, off,
- aclnode->z_size, aclnode->z_acldata, tx);
- off += aclnode->z_size;
+ void *start = acl_phys.z_ace_data;
+ /*
+ * Migrating back embedded?
+ */
+ if (acl_phys.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ acl_phys.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ acl_phys.z_acl_extern_obj = 0;
+ }
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
}
- } else {
- void *start = zacl->z_ace_data;
/*
- * Migrating back embedded?
+ * If Old version then swap count/bytes to match old
+ * layout of znode_acl_phys_t.
*/
- if (zphys->zp_acl.z_acl_extern_obj) {
- error = dmu_object_free(zfsvfs->z_os,
- zp->z_phys->zp_acl.z_acl_extern_obj, tx);
- if (error)
- return (error);
- zphys->zp_acl.z_acl_extern_obj = 0;
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ acl_phys.z_acl_size = aclp->z_acl_count;
+ acl_phys.z_acl_count = aclp->z_acl_bytes;
+ } else {
+ acl_phys.z_acl_size = aclp->z_acl_bytes;
+ acl_phys.z_acl_count = aclp->z_acl_count;
}
-
- for (aclnode = list_head(&aclp->z_acl); aclnode;
- aclnode = list_next(&aclp->z_acl, aclnode)) {
- if (aclnode->z_ace_count == 0)
- continue;
- bcopy(aclnode->z_acldata, start, aclnode->z_size);
- start = (caddr_t)start + aclnode->z_size;
- }
+ acl_phys.z_acl_version = aclp->z_version;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (acl_phys));
}
/*
- * If Old version then swap count/bytes to match old
- * layout of znode_acl_phys_t.
- */
- if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
- zphys->zp_acl.z_acl_size = aclp->z_acl_count;
- zphys->zp_acl.z_acl_count = aclp->z_acl_bytes;
- } else {
- zphys->zp_acl.z_acl_size = aclp->z_acl_bytes;
- zphys->zp_acl.z_acl_count = aclp->z_acl_count;
- }
-
- zphys->zp_acl.z_acl_version = aclp->z_version;
-
- /*
* Replace ACL wide bits, but first clear them.
*/
- zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS;
-
- zp->z_phys->zp_flags |= aclp->z_hints;
+ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
+
+ zp->z_pflags |= aclp->z_hints;
if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
- zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL;
-
- return (0);
+ zp->z_pflags |= ZFS_ACL_TRIVIAL;
+
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE);
+ return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
}
/*
@@ -1643,11 +1744,20 @@
mutex_enter(&zp->z_lock);
mutex_enter(&zp->z_acl_lock);
+
*aclp = NULL;
error = zfs_acl_node_read(zp, aclp, B_TRUE);
if (error == 0) {
- (*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS;
- zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp);
+ uint64_t owner;
+ if (IS_EPHEMERAL(zp->z_uid))
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_UID(zp->z_zfsvfs),
+ &owner, sizeof (owner))) != 0)
+ return (error);
+ else
+ owner = (uint64_t)zp->z_uid;
+ (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+ zfs_acl_chmod(zp->z_zfsvfs, owner, mode, *aclp);
}
mutex_exit(&zp->z_acl_lock);
mutex_exit(&zp->z_lock);
@@ -1716,7 +1826,7 @@
*need_chmod = B_TRUE;
pacep = NULL;
aclp = zfs_acl_alloc(paclp->z_version);
- if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD)
+ if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD || vtype == VLNK)
return (aclp);
while (pacep = zfs_acl_next_ace(paclp, pacep, &who,
&access_mask, &iflags, &type)) {
@@ -1837,6 +1947,8 @@
zfs_acl_t *paclp;
gid_t gid;
boolean_t need_chmod = B_TRUE;
+ boolean_t inherited = B_FALSE;
+ uint64_t parentgid;
bzero(acl_ids, sizeof (zfs_acl_ids_t));
acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
@@ -1845,7 +1957,6 @@
if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
&acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
return (error);
-
/*
* Determine uid and gid.
*/
@@ -1859,6 +1970,12 @@
ZFS_GROUP, &acl_ids->z_fuidp);
gid = vap->va_gid;
} else {
+ if (IS_EPHEMERAL(dzp->z_gid))
+ VERIFY(0 == sa_lookup(dzp->z_sa_hdl, SA_ZPL_GID(zfsvfs),
+ &parentgid, sizeof (parentgid)));
+ else
+ parentgid = (uint64_t)dzp->z_gid;
+
acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
cr, &acl_ids->z_fuidp);
acl_ids->z_fgid = 0;
@@ -1867,17 +1984,17 @@
(uint64_t)vap->va_gid,
cr, ZFS_GROUP, &acl_ids->z_fuidp);
gid = vap->va_gid;
- if (acl_ids->z_fgid != dzp->z_phys->zp_gid &&
+ if (acl_ids->z_fgid != parentgid &&
!groupmember(vap->va_gid, cr) &&
secpolicy_vnode_create_gid(cr) != 0)
acl_ids->z_fgid = 0;
}
if (acl_ids->z_fgid == 0) {
- if (dzp->z_phys->zp_mode & S_ISGID) {
+ if (dzp->z_mode & S_ISGID) {
char *domain;
uint32_t rid;
- acl_ids->z_fgid = dzp->z_phys->zp_gid;
+ acl_ids->z_fgid = parentgid;
gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
cr, ZFS_GROUP);
@@ -1907,7 +2024,7 @@
* file's new group, clear the file's set-GID bit.
*/
- if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) &&
+ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
(vap->va_type == VDIR)) {
acl_ids->z_mode |= S_ISGID;
} else {
@@ -1919,26 +2036,35 @@
if (acl_ids->z_aclp == NULL) {
mutex_enter(&dzp->z_lock);
if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR &&
- (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) &&
- !(dzp->z_phys->zp_flags & ZFS_XATTR)) {
+ (dzp->z_pflags & ZFS_INHERIT_ACE)) &&
+ !(dzp->z_pflags & ZFS_XATTR)) {
mutex_enter(&dzp->z_acl_lock);
VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
- mutex_exit(&dzp->z_acl_lock);
acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
+ mutex_exit(&dzp->z_acl_lock);
+ inherited = B_TRUE;
} else {
acl_ids->z_aclp =
zfs_acl_alloc(zfs_acl_version_zp(dzp));
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
}
mutex_exit(&dzp->z_lock);
if (need_chmod) {
- acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ?
+ acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ?
ZFS_ACL_AUTO_INHERIT : 0;
zfs_acl_chmod(zfsvfs, acl_ids->z_fuid,
acl_ids->z_mode, acl_ids->z_aclp);
}
}
+ if (inherited || vsecp) {
+ acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+ acl_ids->z_aclp, &acl_ids->z_aclp->z_hints);
+ if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+
return (0);
}
@@ -1959,8 +2085,8 @@
boolean_t
zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids)
{
- return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
- zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
+ return (zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
+ zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
}
/*
@@ -1978,12 +2104,12 @@
mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
+ if (mask == 0)
+ return (ENOSYS);
+
if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))
return (error);
- if (mask == 0)
- return (ENOSYS);
-
mutex_enter(&zp->z_acl_lock);
error = zfs_acl_node_read(zp, &aclp, B_FALSE);
@@ -1995,8 +2121,7 @@
/*
* Scan ACL to determine number of ACEs
*/
- if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) &&
- !(mask & VSA_ACE_ALLTYPES)) {
+ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
void *zacep = NULL;
uint64_t who;
uint32_t access_mask;
@@ -2017,7 +2142,7 @@
}
vsecp->vsa_aclcnt = count;
} else
- count = aclp->z_acl_count;
+ count = (int)aclp->z_acl_count;
if (mask & VSA_ACECNT) {
vsecp->vsa_aclcnt = count;
@@ -2051,11 +2176,11 @@
}
if (mask & VSA_ACE_ACLFLAGS) {
vsecp->vsa_aclflags = 0;
- if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED)
+ if (zp->z_pflags & ZFS_ACL_DEFAULTED)
vsecp->vsa_aclflags |= ACL_DEFAULTED;
- if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED)
+ if (zp->z_pflags & ZFS_ACL_PROTECTED)
vsecp->vsa_aclflags |= ACL_PROTECTED;
- if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT)
+ if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
}
@@ -2137,7 +2262,7 @@
if (mask == 0)
return (ENOSYS);
- if (zp->z_phys->zp_flags & ZFS_IMMUTABLE)
+ if (zp->z_pflags & ZFS_IMMUTABLE)
return (EPERM);
if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))
@@ -2153,37 +2278,40 @@
* existing flags.
*/
if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
- aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
+ aclp->z_hints |=
+ (zp->z_pflags & V4_ACL_WIDE_FLAGS);
}
top:
mutex_enter(&zp->z_lock);
mutex_enter(&zp->z_acl_lock);
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
-
- if (zp->z_phys->zp_acl.z_acl_extern_obj) {
- /* Are we upgrading ACL? */
- if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
- zp->z_phys->zp_acl.z_acl_version ==
- ZFS_ACL_VERSION_INITIAL) {
- dmu_tx_hold_free(tx,
- zp->z_phys->zp_acl.z_acl_extern_obj,
- 0, DMU_OBJECT_END);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
- } else {
- dmu_tx_hold_write(tx,
- zp->z_phys->zp_acl.z_acl_extern_obj,
- 0, aclp->z_acl_bytes);
- }
- } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
- }
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
+ /*
+ * If old version and ACL won't fit in bonus and we aren't
+ * upgrading then take out necessary DMU holds
+ */
+
+ if (ZFS_EXTERNAL_ACL(zp)) {
+ if (zfsvfs->z_version <= ZPL_VERSION_SA &&
+ ZNODE_ACL_VERSION(zp) <= ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, ZFS_EXTERNAL_ACL(zp), 0,
+ DMU_OBJECT_END);
+ } else {
+ dmu_tx_hold_write(tx, ZFS_EXTERNAL_ACL(zp),
+ 0, aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
+ }
+
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
mutex_exit(&zp->z_acl_lock);
@@ -2206,7 +2334,6 @@
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
if (fuidp)
@@ -2239,19 +2366,19 @@
*/
if ((v4_mode & WRITE_MASK_DATA) &&
(((ZTOV(zp)->v_type != VDIR) &&
- (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
+ (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
(ZTOV(zp)->v_type == VDIR &&
- (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) {
+ (zp->z_pflags & ZFS_IMMUTABLE)))) {
return (EPERM);
}
if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
- (zp->z_phys->zp_flags & ZFS_NOUNLINK)) {
+ (zp->z_pflags & ZFS_NOUNLINK)) {
return (EPERM);
}
if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
- (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) {
+ (zp->z_pflags & ZFS_AV_QUARANTINED))) {
return (EACCES);
}
@@ -2298,10 +2425,7 @@
uint32_t deny_mask = 0;
zfs_ace_hdr_t *acep = NULL;
boolean_t checkit;
- uid_t fowner;
- uid_t gowner;
-
- zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
+ uint64_t gowner;
mutex_enter(&zp->z_acl_lock);
@@ -2311,6 +2435,12 @@
return (error);
}
+ ASSERT(zp->z_acl_cached);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GID(zfsvfs),
+ &gowner, sizeof (gowner))) != 0)
+ return (error);
+
while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
&iflags, &type)) {
uint32_t mask_matched;
@@ -2332,7 +2462,7 @@
switch (entry_type) {
case ACE_OWNER:
- if (uid == fowner)
+ if (uid == zp->z_uid)
checkit = B_TRUE;
break;
case OWNING_GROUP:
@@ -2410,17 +2540,14 @@
uint32_t have = ACE_ALL_PERMS;
if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
- uid_t owner;
-
- owner = zfs_fuid_map_id(zp->z_zfsvfs,
- zp->z_phys->zp_uid, cr, ZFS_OWNER);
-
return (
- secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 ||
- secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 ||
- secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 ||
- secpolicy_vnode_chown(cr, owner) == 0 ||
- secpolicy_vnode_setdac(cr, owner) == 0 ||
+ secpolicy_vnode_access(cr, ZTOV(zp),
+ zp->z_uid, VREAD) == 0 || secpolicy_vnode_access(cr,
+ ZTOV(zp), zp->z_uid, VWRITE) == 0 ||
+ secpolicy_vnode_access(cr, ZTOV(zp),
+ zp->z_uid, VEXEC) == 0 ||
+ secpolicy_vnode_chown(cr, zp->z_uid) == 0 ||
+ secpolicy_vnode_setdac(cr, zp->z_uid) == 0 ||
secpolicy_vnode_remove(cr) == 0);
}
return (B_TRUE);
@@ -2479,38 +2606,33 @@
boolean_t owner = B_FALSE;
boolean_t groupmbr = B_FALSE;
boolean_t is_attr;
- uid_t fowner;
- uid_t gowner;
uid_t uid = crgetuid(cr);
int error;
- if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
+ if (zdp->z_pflags & ZFS_AV_QUARANTINED)
return (EACCES);
- is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) &&
+ is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
(ZTOV(zdp)->v_type == VDIR));
if (is_attr)
goto slow;
+
mutex_enter(&zdp->z_acl_lock);
- if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) {
+ if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) {
mutex_exit(&zdp->z_acl_lock);
return (0);
}
- if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 ||
- FUID_INDEX(zdp->z_phys->zp_gid) != 0) {
+ if (IS_EPHEMERAL(zdp->z_uid) != 0 || IS_EPHEMERAL(zdp->z_gid) != 0) {
mutex_exit(&zdp->z_acl_lock);
goto slow;
}
- fowner = (uid_t)zdp->z_phys->zp_uid;
- gowner = (uid_t)zdp->z_phys->zp_gid;
-
- if (uid == fowner) {
+ if (uid == zdp->z_uid) {
owner = B_TRUE;
- if (zdp->z_phys->zp_mode & S_IXUSR) {
+ if (zdp->z_mode & S_IXUSR) {
mutex_exit(&zdp->z_acl_lock);
return (0);
} else {
@@ -2518,9 +2640,9 @@
goto slow;
}
}
- if (groupmember(gowner, cr)) {
+ if (groupmember(zdp->z_gid, cr)) {
groupmbr = B_TRUE;
- if (zdp->z_phys->zp_mode & S_IXGRP) {
+ if (zdp->z_mode & S_IXGRP) {
mutex_exit(&zdp->z_acl_lock);
return (0);
} else {
@@ -2529,7 +2651,7 @@
}
}
if (!owner && !groupmbr) {
- if (zdp->z_phys->zp_mode & S_IXOTH) {
+ if (zdp->z_mode & S_IXOTH) {
mutex_exit(&zdp->z_acl_lock);
return (0);
}
@@ -2555,20 +2677,25 @@
uint32_t working_mode;
int error;
int is_attr;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
boolean_t check_privs;
znode_t *xzp;
znode_t *check_zp = zp;
- is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
- (ZTOV(zp)->v_type == VDIR));
+ is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
/*
* If attribute then validate against base file
*/
if (is_attr) {
+ uint64_t parent;
+
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zp->z_zfsvfs), &parent,
+ sizeof (parent))) != 0)
+ return (error);
+
if ((error = zfs_zget(zp->z_zfsvfs,
- zp->z_phys->zp_parent, &xzp)) != 0) {
+ parent, &xzp)) != 0) {
return (error);
}
@@ -2607,12 +2734,8 @@
}
if (error && check_privs) {
- uid_t owner;
mode_t checkmode = 0;
- owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr,
- ZFS_OWNER);
-
/*
* First check for implicit owner permission on
* read_acl/read_attributes
@@ -2622,7 +2745,7 @@
ASSERT(working_mode != 0);
if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
- owner == crgetuid(cr)))
+ zp->z_uid == crgetuid(cr)))
working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
@@ -2636,19 +2759,19 @@
if (checkmode)
error = secpolicy_vnode_access(cr, ZTOV(check_zp),
- owner, checkmode);
+ zp->z_uid, checkmode);
if (error == 0 && (working_mode & ACE_WRITE_OWNER))
- error = secpolicy_vnode_chown(cr, owner);
+ error = secpolicy_vnode_chown(cr, zp->z_uid);
if (error == 0 && (working_mode & ACE_WRITE_ACL))
- error = secpolicy_vnode_setdac(cr, owner);
+ error = secpolicy_vnode_setdac(cr, zp->z_uid);
if (error == 0 && (working_mode &
(ACE_DELETE|ACE_DELETE_CHILD)))
error = secpolicy_vnode_remove(cr);
if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
- error = secpolicy_vnode_chown(cr, owner);
+ error = secpolicy_vnode_chown(cr, zp->z_uid);
}
if (error == 0) {
/*
@@ -2693,12 +2816,9 @@
mode_t missing_perms, cred_t *cr)
{
int error;
- uid_t downer;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- downer = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr, ZFS_OWNER);
-
- error = secpolicy_vnode_access(cr, ZTOV(dzp), downer, missing_perms);
+
+ error = secpolicy_vnode_access(cr, ZTOV(dzp),
+ dzp->z_uid, missing_perms);
if (error == 0)
error = zfs_sticky_remove_access(dzp, zp, cr);
@@ -2765,7 +2885,7 @@
* to determine what was found.
*/
- if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+ if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
return (EPERM);
/*
@@ -2835,7 +2955,7 @@
int add_perm;
int error;
- if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
+ if (szp->z_pflags & ZFS_AV_QUARANTINED)
return (EACCES);
add_perm = (ZTOV(szp)->v_type == VDIR) ?
--- a/usr/src/uts/common/fs/zfs/zfs_byteswap.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_byteswap.c Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -27,6 +27,7 @@
#include <sys/vfs.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
#include <sys/zfs_acl.h>
void
--- a/usr/src/uts/common/fs/zfs/zfs_ctldir.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_ctldir.c Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -215,6 +215,7 @@
{
vnode_t *vp, *rvp;
zfsctl_node_t *zcp;
+ uint64_t crtime[2];
ASSERT(zfsvfs->z_ctldir == NULL);
@@ -225,7 +226,9 @@
zcp->zc_id = ZFSCTL_INO_ROOT;
VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
- ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
+ VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+ &crtime, sizeof (crtime)));
+ ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime);
VN_RELE(rvp);
/*
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -52,6 +52,8 @@
#include <sys/atomic.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/extdirent.h>
@@ -286,8 +288,10 @@
* See if there's an object by this name; if so, put a hold on it.
*/
if (flag & ZXATTR) {
- zoid = dzp->z_phys->zp_xattr;
- error = (zoid == 0 ? ENOENT : 0);
+ error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+ sizeof (zoid));
+ if (error == 0)
+ error = (zoid == 0 ? ENOENT : 0);
} else {
if (update)
vp = dnlc_lookup(ZTOV(dzp), name);
@@ -379,25 +383,29 @@
zfs_dirlock_t *dl;
znode_t *zp;
int error = 0;
+ uint64_t parent;
if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
*vpp = ZTOV(dzp);
VN_HOLD(*vpp);
} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+
/*
* If we are a snapshot mounted under .zfs, return
* the vp for the snapshot directory.
*/
- if (dzp->z_phys->zp_parent == dzp->z_id &&
- zfsvfs->z_parent != zfsvfs) {
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
+ if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
"snapshot", vpp, NULL, 0, NULL, kcred,
NULL, NULL, NULL);
return (error);
}
rw_enter(&dzp->z_parent_lock, RW_READER);
- error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
+ error = zfs_zget(zfsvfs, parent, &zp);
if (error == 0)
*vpp = ZTOV(zp);
rw_exit(&dzp->z_parent_lock);
@@ -445,7 +453,7 @@
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
ASSERT(zp->z_unlinked);
- ASSERT3U(zp->z_phys->zp_links, ==, 0);
+ ASSERT(zp->z_links == 0);
VERIFY3U(0, ==,
zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
@@ -540,10 +548,12 @@
(ZTOV(xzp)->v_type == VLNK));
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
- dmu_tx_hold_bonus(tx, xzp->z_id);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ /* Is this really needed ? */
+ zfs_sa_upgrade_txholds(tx, xzp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
@@ -576,15 +586,16 @@
znode_t *xzp = NULL;
dmu_tx_t *tx;
uint64_t acl_obj;
+ uint64_t xattr_obj;
int error;
+ ASSERT(zp->z_links == 0);
ASSERT(ZTOV(zp)->v_count == 0);
- ASSERT(zp->z_phys->zp_links == 0);
/*
* If this is an attribute directory, purge its contents.
*/
- if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) {
+ if (ZTOV(zp)->v_type == VDIR && (zp->z_pflags & ZFS_XATTR)) {
if (zfs_purgedir(zp) != 0) {
/*
* Not enough space to delete some xattrs.
@@ -613,12 +624,14 @@
* If the file has extended attributes, we're going to unlink
* the xattr dir.
*/
- if (zp->z_phys->zp_xattr) {
- error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
ASSERT(error == 0);
}
- acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+ acl_obj = ZFS_EXTERNAL_ACL(zp);
/*
* Set up the final transaction.
@@ -627,11 +640,13 @@
dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
if (xzp) {
- dmu_tx_hold_bonus(tx, xzp->z_id);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
}
if (acl_obj)
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
/*
@@ -646,10 +661,12 @@
}
if (xzp) {
- dmu_buf_will_dirty(xzp->z_dbuf, tx);
+ ASSERT(error == 0);
mutex_enter(&xzp->z_lock);
xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
- xzp->z_phys->zp_links = 0; /* no more links to it */
+ xzp->z_links = 0; /* no more links to it */
+ VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &xzp->z_links, sizeof (xzp->z_links), tx));
mutex_exit(&xzp->z_lock);
zfs_unlinked_add(xzp, tx);
}
@@ -667,11 +684,12 @@
}
static uint64_t
-zfs_dirent(znode_t *zp)
+zfs_dirent(znode_t *zp, uint64_t mode)
{
uint64_t de = zp->z_id;
+
if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
- de |= IFTODT((zp)->z_phys->zp_mode) << 60;
+ de |= IFTODT(mode) << 60;
return (de);
}
@@ -682,12 +700,15 @@
zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
{
znode_t *dzp = dl->dl_dzp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
vnode_t *vp = ZTOV(zp);
uint64_t value;
int zp_is_dir = (vp->v_type == VDIR);
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
int error;
- dmu_buf_will_dirty(zp->z_dbuf, tx);
mutex_enter(&zp->z_lock);
if (!(flag & ZRENAMING)) {
@@ -696,22 +717,47 @@
mutex_exit(&zp->z_lock);
return (ENOENT);
}
- zp->z_phys->zp_links++;
+ zp->z_links++;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+
}
- zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &dzp->z_id, sizeof (dzp->z_id));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
- if (!(flag & ZNEW))
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ if (!(flag & ZNEW)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime, B_TRUE);
+ }
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
mutex_exit(&zp->z_lock);
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
mutex_enter(&dzp->z_lock);
- dzp->z_phys->zp_size++; /* one dirent added */
- dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */
- zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+ dzp->z_size++;
+ dzp->z_links += zp_is_dir;
+ count = 0;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
mutex_exit(&dzp->z_lock);
- value = zfs_dirent(zp);
+ value = zfs_dirent(zp, zp->z_mode);
error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
8, 1, &value, tx);
ASSERT(error == 0);
@@ -733,16 +779,18 @@
boolean_t *unlinkedp)
{
znode_t *dzp = dl->dl_dzp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
vnode_t *vp = ZTOV(zp);
int zp_is_dir = (vp->v_type == VDIR);
boolean_t unlinked = B_FALSE;
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
int error;
dnlc_remove(ZTOV(dzp), dl->dl_name);
if (!(flag & ZRENAMING)) {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
if (vn_vfswlock(vp)) /* prevent new mounts on zp */
return (EBUSY);
@@ -752,35 +800,58 @@
}
mutex_enter(&zp->z_lock);
- if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */
+
+ if (zp_is_dir && !zfs_dirempty(zp)) {
mutex_exit(&zp->z_lock);
vn_vfsunlock(vp);
return (EEXIST);
}
- if (zp->z_phys->zp_links <= zp_is_dir) {
+
+ if (zp->z_links <= zp_is_dir) {
zfs_panic_recover("zfs: link count on %s is %u, "
"should be at least %u",
zp->z_vnode->v_path ? zp->z_vnode->v_path :
- "<unknown>", (int)zp->z_phys->zp_links,
+ "<unknown>", (int)zp->z_links,
zp_is_dir + 1);
- zp->z_phys->zp_links = zp_is_dir + 1;
+ zp->z_links = zp_is_dir + 1;
}
- if (--zp->z_phys->zp_links == zp_is_dir) {
+ if (--zp->z_links == zp_is_dir) {
zp->z_unlinked = B_TRUE;
- zp->z_phys->zp_links = 0;
+ zp->z_links = 0;
unlinked = B_TRUE;
} else {
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
+ B_TRUE);
}
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &zp->z_links, sizeof (zp->z_links));
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ count = 0;
+ ASSERT(error == 0);
mutex_exit(&zp->z_lock);
vn_vfsunlock(vp);
}
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
mutex_enter(&dzp->z_lock);
- dzp->z_phys->zp_size--; /* one dirent removed */
- dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */
- zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+ dzp->z_size--; /* one dirent removed */
+ dzp->z_links -= zp_is_dir; /* ".." link from zp */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
mutex_exit(&dzp->z_lock);
if (zp->z_zfsvfs->z_norm) {
@@ -815,7 +886,7 @@
boolean_t
zfs_dirempty(znode_t *dzp)
{
- return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
+ return (dzp->z_size == 2 && dzp->z_dirlocks == 0);
}
int
@@ -827,6 +898,7 @@
int error;
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
+ uint64_t parent;
*xvpp = NULL;
@@ -842,7 +914,9 @@
}
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
@@ -855,14 +929,18 @@
dmu_tx_abort(tx);
return (error);
}
- zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids);
+ zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
- ASSERT(xzp->z_phys->zp_parent == zp->z_id);
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- zp->z_phys->zp_xattr = xzp->z_id;
+ if ((error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent))) != 0)
+ return (0);
+
+ ASSERT(parent == zp->z_id);
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+ sizeof (xzp->z_id), tx));
(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
xzp, "", NULL, acl_ids.z_fuidp, vap);
@@ -907,7 +985,6 @@
return (0);
}
- ASSERT(zp->z_phys->zp_xattr == 0);
if (!(flags & CREATE_XATTR_DIR)) {
zfs_dirent_unlock(dl);
@@ -962,20 +1039,14 @@
zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
{
uid_t uid;
- uid_t downer;
- uid_t fowner;
- zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
if (zdp->z_zfsvfs->z_replay)
return (0);
- if ((zdp->z_phys->zp_mode & S_ISVTX) == 0)
+ if ((zdp->z_mode & S_ISVTX) == 0)
return (0);
- downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER);
- fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER);
-
- if ((uid = crgetuid(cr)) == downer || uid == fowner ||
+ if ((uid = crgetuid(cr)) == zdp->z_uid || uid == zp->z_uid ||
(ZTOV(zp)->v_type == VREG &&
zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
return (0);
--- a/usr/src/uts/common/fs/zfs/zfs_fuid.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_fuid.c Tue Mar 16 09:43:38 2010 -0600
@@ -389,10 +389,26 @@
void
zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
{
- *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid,
- cr, ZFS_OWNER);
- *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid,
- cr, ZFS_GROUP);
+ uint64_t fuid, fgid;
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
+
+ if (IS_EPHEMERAL(zp->z_uid) || IS_EPHEMERAL(zp->z_gid)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zp->z_zfsvfs),
+ NULL, &fuid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zp->z_zfsvfs),
+ NULL, &fgid, 8);
+ VERIFY(0 == sa_bulk_lookup(zp->z_sa_hdl, bulk, count));
+ }
+ if (IS_EPHEMERAL(zp->z_uid))
+ *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ else
+ *uidp = zp->z_uid;
+ if (IS_EPHEMERAL(zp->z_gid))
+ *gidp = zfs_fuid_map_id(zp->z_zfsvfs,
+ zp->z_gid, cr, ZFS_GROUP);
+ else
+ *gidp = zp->z_gid;
}
uid_t
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c Tue Mar 16 09:43:38 2010 -0600
@@ -68,6 +68,7 @@
#include "zfs_namecheck.h"
#include "zfs_prop.h"
#include "zfs_deleg.h"
+#include "zfs_comutil.h"
extern struct modlfs zfs_modlfs;
@@ -1954,20 +1955,10 @@
case ZFS_PROP_VERSION:
{
zfsvfs_t *zfsvfs;
- uint64_t maxzplver = ZPL_VERSION;
if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs)) != 0)
break;
- if (zfs_earlier_version(dsname, SPA_VERSION_USERSPACE))
- maxzplver = ZPL_VERSION_USERSPACE - 1;
- if (zfs_earlier_version(dsname, SPA_VERSION_FUID))
- maxzplver = ZPL_VERSION_FUID - 1;
- if (intval > maxzplver) {
- zfsvfs_rele(zfsvfs, FTAG);
- return (ENOTSUP);
- }
-
err = zfs_set_version(zfsvfs, intval);
zfsvfs_rele(zfsvfs, FTAG);
@@ -2558,8 +2549,8 @@
*/
static int
zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
- boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops,
- boolean_t *is_ci)
+ boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
+ nvlist_t *zplprops, boolean_t *is_ci)
{
uint64_t sense = ZFS_PROP_UNDEFINED;
uint64_t norm = ZFS_PROP_UNDEFINED;
@@ -2595,6 +2586,7 @@
*/
if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) ||
(zplver >= ZPL_VERSION_FUID && !fuids_ok) ||
+ (zplver >= ZPL_VERSION_SA && !sa_ok) ||
(zplver < ZPL_VERSION_NORMALIZATION &&
(norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
sense != ZFS_PROP_UNDEFINED)))
@@ -2636,11 +2628,13 @@
zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
nvlist_t *zplprops, boolean_t *is_ci)
{
- boolean_t fuids_ok = B_TRUE;
+ boolean_t fuids_ok, sa_ok;
uint64_t zplver = ZPL_VERSION;
objset_t *os = NULL;
char parentname[MAXNAMELEN];
char *cp;
+ spa_t *spa;
+ uint64_t spa_vers;
int error;
(void) strlcpy(parentname, dataset, sizeof (parentname));
@@ -2648,12 +2642,15 @@
ASSERT(cp != NULL);
cp[0] = '\0';
- if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE))
- zplver = ZPL_VERSION_USERSPACE - 1;
- if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) {
- zplver = ZPL_VERSION_FUID - 1;
- fuids_ok = B_FALSE;
- }
+ if ((error = spa_open(dataset, &spa, FTAG)) != 0)
+ return (error);
+
+ spa_vers = spa_version(spa);
+ spa_close(spa, FTAG);
+
+ zplver = zfs_zpl_version_map(spa_vers);
+ fuids_ok = (zplver >= ZPL_VERSION_FUID);
+ sa_ok = (zplver >= ZPL_VERSION_SA);
/*
* Open parent object set so we can inherit zplprop values.
@@ -2661,7 +2658,7 @@
if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0)
return (error);
- error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, createprops,
+ error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops,
zplprops, is_ci);
dmu_objset_rele(os, FTAG);
return (error);
@@ -2671,17 +2668,17 @@
zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
nvlist_t *zplprops, boolean_t *is_ci)
{
- boolean_t fuids_ok = B_TRUE;
+ boolean_t fuids_ok;
+ boolean_t sa_ok;
uint64_t zplver = ZPL_VERSION;
int error;
- if (spa_vers < SPA_VERSION_FUID) {
- zplver = ZPL_VERSION_FUID - 1;
- fuids_ok = B_FALSE;
- }
-
- error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, createprops,
- zplprops, is_ci);
+ zplver = zfs_zpl_version_map(spa_vers);
+ fuids_ok = (zplver >= ZPL_VERSION_FUID);
+ sa_ok = (zplver >= ZPL_VERSION_SA);
+
+ error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
+ createprops, zplprops, is_ci);
return (error);
}
--- a/usr/src/uts/common/fs/zfs/zfs_log.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -276,21 +276,25 @@
lr = (lr_create_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
lr->lr_foid = zp->z_id;
- lr->lr_mode = zp->z_phys->zp_mode;
- if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) {
- lr->lr_uid = (uint64_t)zp->z_phys->zp_uid;
+ lr->lr_mode = zp->z_mode;
+ if (!IS_EPHEMERAL(zp->z_uid)) {
+ lr->lr_uid = (uint64_t)zp->z_uid;
} else {
lr->lr_uid = fuidp->z_fuid_owner;
}
- if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) {
- lr->lr_gid = (uint64_t)zp->z_phys->zp_gid;
+ if (!IS_EPHEMERAL(zp->z_gid)) {
+ lr->lr_gid = (uint64_t)zp->z_gid;
} else {
lr->lr_gid = fuidp->z_fuid_group;
}
- lr->lr_gen = zp->z_phys->zp_gen;
- lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
- lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
- lr->lr_rdev = zp->z_phys->zp_rdev;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
+ sizeof (uint64_t));
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ lr->lr_crtime, sizeof (uint64_t) * 2);
+
+ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev,
+ sizeof (lr->lr_rdev)) != 0)
+ lr->lr_rdev = 0;
/*
* Fill in xvattr info if any
@@ -404,12 +408,13 @@
lr = (lr_create_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
lr->lr_foid = zp->z_id;
- lr->lr_mode = zp->z_phys->zp_mode;
- lr->lr_uid = zp->z_phys->zp_uid;
- lr->lr_gid = zp->z_phys->zp_gid;
- lr->lr_gen = zp->z_phys->zp_gen;
- lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
- lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+ lr->lr_uid = zp->z_uid;
+ lr->lr_gid = zp->z_gid;
+ lr->lr_mode = zp->z_mode;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
+ sizeof (uint64_t));
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ lr->lr_crtime, sizeof (uint64_t) * 2);
bcopy(name, (char *)(lr + 1), namesize);
bcopy(link, (char *)(lr + 1) + namesize, linksize);
--- a/usr/src/uts/common/fs/zfs/zfs_replay.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_replay.c Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -645,7 +645,7 @@
length = lr->lr_length;
eod = offset + length; /* end of data for this write */
- orig_eof = zp->z_phys->zp_size;
+ orig_eof = zp->z_size;
/* If it's a dmu_sync() block, write the whole block */
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
@@ -667,8 +667,8 @@
* write needs to be there. So we write the whole block and
* reduce the eof.
*/
- if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */
- zp->z_phys->zp_size = eod;
+ if (orig_eof < zp->z_size) /* file length grew ? */
+ zp->z_size = eod;
VN_RELE(ZTOV(zp));
@@ -695,9 +695,9 @@
return (error);
end = lr->lr_offset + lr->lr_length;
- if (end > zp->z_phys->zp_size) {
- ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz);
- zp->z_phys->zp_size = end;
+ if (end > zp->z_size) {
+ ASSERT3U(end - zp->z_size, <, zp->z_blksz);
+ zp->z_size = end;
}
VN_RELE(ZTOV(zp));
--- a/usr/src/uts/common/fs/zfs/zfs_rlock.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_rlock.c Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -112,7 +112,7 @@
* Range locking is also used by zvol and uses a
* dummied up znode. However, for zvol, we don't need to
* append or grow blocksize, and besides we don't have
- * a z_phys or z_zfsvfs - so skip that processing.
+ * a "sa" data or z_zfsvfs - so skip that processing.
*
* Yes, this is ugly, and would be solved by not handling
* grow or append in range lock code. If that was done then
@@ -125,14 +125,14 @@
* This is done under z_range_lock to avoid races.
*/
if (new->r_type == RL_APPEND)
- new->r_off = zp->z_phys->zp_size;
+ new->r_off = zp->z_size;
/*
* If we need to grow the block size then grab the whole
* file range. This is also done under z_range_lock to
* avoid races.
*/
- end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
+ end_size = MAX(zp->z_size, new->r_off + len);
if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
new->r_off = 0;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/zfs_sa.c Tue Mar 16 09:43:38 2010 -0600
@@ -0,0 +1,340 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/vnode.h>
+#include <sys/sa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_sa.h>
+
+/*
+ * ZPL attribute registration table.
+ * Order of attributes doesn't matter
+ * a unique value will be assigned for each
+ * attribute that is file system specific
+ *
+ * This is just the set of ZPL attributes that this
+ * version of ZFS deals with natively. The file system
+ * could have other attributes stored in files, but they will be
+ * ignored. The SA framework will preserve them, just that
+ * this version of ZFS won't change or delete them.
+ */
+
+sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
+ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+ {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0},
+ {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0},
+ {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0},
+ {"ZPL_DACL_ACES", 0, SA_ACL, 0},
+ {NULL, 0, 0, 0}
+};
+
+#ifdef _KERNEL
+
+int
+zfs_sa_readlink(znode_t *zp, uio_t *uio)
+{
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ size_t bufsz;
+ int error;
+
+ bufsz = zp->z_size;
+ if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE < db->db_size) {
+ error = uiomove((caddr_t)db->db_data +
+ ZFS_OLD_ZNODE_PHYS_SIZE,
+ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+ } else {
+ dmu_buf_t *dbp;
+ if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id,
+ 0, FTAG, &dbp)) == 0) {
+ error = uiomove(dbp->db_data,
+ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+ dmu_buf_rele(dbp, FTAG);
+ }
+ }
+ return (error);
+}
+
+void
+zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
+{
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+
+ if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) {
+ VERIFY(dmu_set_bonus(db,
+ len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0);
+ if (len) {
+ bcopy(link, (caddr_t)db->db_data +
+ ZFS_OLD_ZNODE_PHYS_SIZE, len);
+ }
+ } else {
+ dmu_buf_t *dbp;
+
+ zfs_grow_blocksize(zp, len, tx);
+ VERIFY(0 == dmu_buf_hold(zp->z_zfsvfs->z_os,
+ zp->z_id, 0, FTAG, &dbp));
+
+ dmu_buf_will_dirty(dbp, tx);
+
+ ASSERT3U(len, <=, dbp->db_size);
+ bcopy(link, dbp->db_data, len);
+ dmu_buf_rele(dbp, FTAG);
+ }
+}
+
+void
+zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ xoptattr_t *xoap;
+
+ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+ if (zp->z_is_sa) {
+ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+ &xoap->xoa_av_scanstamp,
+ sizeof (xoap->xoa_av_scanstamp)) != 0)
+ return;
+ } else {
+ dmu_object_info_t doi;
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ int len;
+
+ if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP))
+ return;
+
+ sa_object_info(zp->z_sa_hdl, &doi);
+ len = sizeof (xoap->xoa_av_scanstamp) +
+ ZFS_OLD_ZNODE_PHYS_SIZE;
+
+ if (len <= doi.doi_bonus_size) {
+ (void) memcpy(xoap->xoa_av_scanstamp,
+ (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ sizeof (xoap->xoa_av_scanstamp));
+ }
+ }
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+}
+
+void
+zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ xoptattr_t *xoap;
+
+ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+ if (zp->z_is_sa)
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+ &xoap->xoa_av_scanstamp,
+ sizeof (xoap->xoa_av_scanstamp), tx));
+ else {
+ dmu_object_info_t doi;
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ int len;
+
+ sa_object_info(zp->z_sa_hdl, &doi);
+ len = sizeof (xoap->xoa_av_scanstamp) +
+ ZFS_OLD_ZNODE_PHYS_SIZE;
+ if (len > doi.doi_bonus_size)
+ VERIFY(dmu_set_bonus(db, len, tx) == 0);
+ (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp));
+
+ zp->z_pflags |= ZFS_BONUS_SCANSTAMP;
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ &zp->z_pflags, sizeof (uint64_t), tx));
+ }
+}
+
+/*
+ * I'm not convinced we should do any of this upgrade.
+ * since the SA code can read both old/new znode formats
+ * with probably little to know performance difference.
+ *
+ * All new files will be created with the new format.
+ */
+
+void
+zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+ dmu_buf_t *db = sa_get_db(hdl);
+ znode_t *zp = sa_get_userdata(hdl);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ sa_bulk_attr_t bulk[20];
+ int count = 0;
+ sa_bulk_attr_t sa_attrs[20] = { 0 };
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t uid, gid, mode, rdev, xattr, parent;
+ uint64_t crtime[2], mtime[2], ctime[2];
+ zfs_acl_phys_t znode_acl;
+ char *slink = NULL;
+ char scanstamp[AV_SCANSTAMP_SZ];
+
+ /*
+ * No upgrade if ACL isn't cached
+ * since we won't know which locks are held
+ * and ready the ACL would require special "locked"
+ * interfaces that would be messy
+ */
+ if (zp->z_acl_cached == NULL)
+ return;
+
+ /* First do a bulk query of the attributes that aren't cached */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &znode_acl, 88);
+
+ if (sa_bulk_lookup_locked(hdl, bulk, count) != 0)
+ return;
+
+
+ /*
+ * While the order here doesn't matter its best to try and organize
+ * it is such a way to pick up an already existing layout number
+ */
+ count = 0;
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs),
+ NULL, &zp->z_gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ zp->z_atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+ &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, 8);
+ if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR)
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &zp->z_acl_cached->z_acl_count, 8);
+
+ if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
+ zfs_acl_xform(zp, zp->z_acl_cached, CRED());
+
+ locate.cb_aclp = zp->z_acl_cached;
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes);
+ if (xattr)
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs),
+ NULL, &rdev, 8);
+
+ /*
+ * is it a symlink ?
+ *
+ * this will probably never be exercised since we won't
+ * have the cached ACL.
+ */
+ if (ZTOV(zp)->v_type == VLNK) {
+ slink = kmem_zalloc(zp->z_size + 1, KM_SLEEP);
+ if (zp->z_size + ZFS_OLD_ZNODE_PHYS_SIZE)
+ bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ slink, zp->z_size);
+ else {
+ dmu_buf_t *dbp;
+ if (dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0,
+ FTAG, &dbp))
+ return;
+ bcopy(dbp->db_data, slink, zp->z_size);
+ dmu_buf_rele(dbp, FTAG);
+ }
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SYMLINK(zfsvfs),
+ NULL, slink, zp->z_size);
+ }
+
+ /* if scanstamp then add scanstamp */
+
+ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
+ bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ scanstamp, AV_SCANSTAMP_SZ);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs),
+ NULL, scanstamp, AV_SCANSTAMP_SZ);
+ zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
+ }
+
+ VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
+ VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs,
+ count, tx) == 0);
+ if (znode_acl.z_acl_extern_obj)
+ VERIFY(0 == dmu_object_free(zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, tx));
+
+ zp->z_is_sa = B_TRUE;
+ if (slink)
+ kmem_free(slink, zp->z_size + 1);
+
+}
+
+void
+zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp)
+{
+ if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa)
+ return;
+
+ ASSERT(!zp->z_is_sa);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ if (ZFS_EXTERNAL_ACL(zp)) {
+ dmu_tx_hold_free(tx, ZFS_EXTERNAL_ACL(zp), 0,
+ DMU_OBJECT_END);
+ }
+}
+
+#endif
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c Tue Mar 16 09:43:38 2010 -0600
@@ -46,6 +46,7 @@
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
#include <sys/zap.h>
+#include <sys/sa.h>
#include <sys/varargs.h>
#include <sys/policy.h>
#include <sys/atomic.h>
@@ -60,6 +61,8 @@
#include <sys/dnlc.h>
#include <sys/dmu_objset.h>
#include <sys/spa_boot.h>
+#include <sys/sa.h>
+#include "zfs_comutil.h"
int zfsfstype;
vfsops_t *zfs_vfsops = NULL;
@@ -582,6 +585,7 @@
(void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
err = zap_lookup(os, obj, buf, 8, 1, &used);
+
ASSERT(err == 0 || err == ENOENT);
/* no underflow/overflow */
ASSERT(delta > 0 || used >= -delta);
@@ -592,20 +596,38 @@
else
err = zap_update(os, obj, buf, 8, 1, &used, tx);
ASSERT(err == 0);
+
}
static int
-zfs_space_delta_cb(dmu_object_type_t bonustype, void *bonus,
+zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
uint64_t *userp, uint64_t *groupp)
{
- znode_phys_t *znp = bonus;
+ znode_phys_t *znp = data;
+ int error = 0;
- if (bonustype != DMU_OT_ZNODE)
+ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
return (ENOENT);
- *userp = znp->zp_uid;
- *groupp = znp->zp_gid;
- return (0);
+ if (bonustype == DMU_OT_ZNODE) {
+ *userp = znp->zp_uid;
+ *groupp = znp->zp_gid;
+ } else {
+ int hdrsize;
+
+ ASSERT(bonustype == DMU_OT_SA);
+ hdrsize = sa_hdrsize(data);
+
+ if (hdrsize != 0) {
+ *userp = *((uint64_t *)((uintptr_t)data + hdrsize +
+ SA_UID_OFFSET));
+ *groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
+ SA_GID_OFFSET));
+ } else {
+ error = ENOENT;
+ }
+ }
+ return (error);
}
static void
@@ -792,7 +814,7 @@
}
boolean_t
-zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
+zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
{
char buf[32];
uint64_t used, quota, usedobj, quotaobj;
@@ -815,6 +837,32 @@
return (used >= quota);
}
+boolean_t
+zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
+{
+ uint64_t fuid;
+ uint64_t quotaobj;
+ uid_t id;
+
+ quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+ id = isgroup ? zp->z_gid : zp->z_uid;
+
+ if (quotaobj == 0 || zfsvfs->z_replay)
+ return (B_FALSE);
+
+ if (IS_EPHEMERAL(id)) {
+ VERIFY(0 == sa_lookup(zp->z_sa_hdl,
+ isgroup ? SA_ZPL_GID(zfsvfs) : SA_ZPL_UID(zfsvfs),
+ &fuid, sizeof (fuid)));
+ } else {
+ fuid = (uint64_t)id;
+ }
+
+ return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
+}
+
+
int
zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
{
@@ -822,6 +870,7 @@
zfsvfs_t *zfsvfs;
uint64_t zval;
int i, error;
+ uint64_t sa_obj;
zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
@@ -879,6 +928,26 @@
zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+ if (zfsvfs->z_use_sa) {
+ /* should either have both of these objects or none */
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+ &sa_obj);
+ if (error)
+ return (error);
+ } else {
+ /*
+ * Pre SA versions file systems should never touch
+ * either the attribute registration or layout objects.
+ */
+ sa_obj = 0;
+ }
+
+ zfsvfs->z_attr_table = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END);
+
+ if (zfsvfs->z_version >= ZPL_VERSION_SA)
+ sa_register_update_callback(os, zfs_sa_upgrade);
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
&zfsvfs->z_root);
@@ -1051,6 +1120,7 @@
vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
}
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
}
static int
@@ -1732,7 +1802,7 @@
mutex_enter(&zfsvfs->z_znodes_lock);
for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
zp = list_next(&zfsvfs->z_all_znodes, zp))
- if (zp->z_dbuf) {
+ if (zp->z_sa_hdl) {
ASSERT(ZTOV(zp)->v_count > 0);
zfs_znode_dmu_fini(zp);
}
@@ -1927,7 +1997,9 @@
ZFS_EXIT(zfsvfs);
return (err);
}
- zp_gen = zp->z_phys->zp_gen & gen_mask;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+ sizeof (uint64_t));
+ zp_gen = zp_gen & gen_mask;
if (zp_gen == 0)
zp_gen = 1;
if (zp->z_unlinked || zp_gen != fid_gen) {
@@ -1966,7 +2038,7 @@
int
zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
{
- int err;
+ int err, err2;
ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
@@ -1977,6 +2049,17 @@
zfsvfs->z_os = NULL;
} else {
znode_t *zp;
+ uint64_t sa_obj = 0;
+
+ err2 = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj);
+
+ if ((err || err2) && zfsvfs->z_version >= ZPL_VERSION_SA)
+ goto bail;
+
+
+ zfsvfs->z_attr_table = sa_setup(zfsvfs->z_os, sa_obj,
+ zfs_attr_table, ZPL_END);
VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
@@ -1995,6 +2078,7 @@
}
+bail:
/* release the VOPs */
rw_exit(&zfsvfs->z_teardown_inactive_lock);
rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
@@ -2111,13 +2195,23 @@
if (newvers < zfsvfs->z_version)
return (EINVAL);
+ if (zfs_spa_version_map(newvers) >
+ spa_version(dmu_objset_spa(zfsvfs->z_os)))
+ return (ENOTSUP);
+
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ ZFS_SA_ATTRS);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ }
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
return (error);
}
+
error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
8, 1, &newvers, tx);
@@ -2126,6 +2220,22 @@
return (error);
}
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ uint64_t sa_obj;
+
+ ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+ SPA_VERSION_SA);
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT3U(error, ==, 0);
+
+ VERIFY(0 == sa_set_sa_object(os, sa_obj));
+ sa_register_update_callback(os, zfs_sa_upgrade);
+ }
+
spa_history_internal_log(LOG_DS_UPGRADE,
dmu_objset_spa(os), tx, CRED(),
"oldver=%llu newver=%llu dataset = %llu",
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c Tue Mar 16 09:43:38 2010 -0600
@@ -61,6 +61,7 @@
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
+#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
@@ -69,6 +70,7 @@
#include "fs/fs_subr.h"
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
+#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
@@ -176,7 +178,7 @@
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
+ if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
((flag & FAPPEND) == 0)) {
ZFS_EXIT(zfsvfs);
return (EPERM);
@@ -184,8 +186,7 @@
if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
ZTOV(zp)->v_type == VREG &&
- !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
- zp->z_phys->zp_size > 0) {
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
if (fs_vscan(*vpp, cr, 0) != 0) {
ZFS_EXIT(zfsvfs);
return (EACCES);
@@ -223,8 +224,7 @@
if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
ZTOV(zp)->v_type == VREG &&
- !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
- zp->z_phys->zp_size > 0)
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
VERIFY(fs_vscan(vp, cr, 1) == 0);
ZFS_EXIT(zfsvfs);
@@ -244,7 +244,7 @@
int error;
boolean_t hole;
- file_sz = zp->z_phys->zp_size;
+ file_sz = zp->z_size;
if (noff >= file_sz) {
return (ENXIO);
}
@@ -453,7 +453,7 @@
ZFS_VERIFY_ZP(zp);
os = zfsvfs->z_os;
- if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
+ if (zp->z_pflags & ZFS_AV_QUARANTINED) {
ZFS_EXIT(zfsvfs);
return (EACCES);
}
@@ -477,7 +477,7 @@
/*
* Check for mandatory locks
*/
- if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+ if (MANDMODE(zp->z_mode)) {
if (error = chklock(vp, FREAD,
uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
ZFS_EXIT(zfsvfs);
@@ -500,13 +500,13 @@
* If we are reading past end-of-file we can skip
* to the end; but we might still need to set atime.
*/
- if (uio->uio_loffset >= zp->z_phys->zp_size) {
+ if (uio->uio_loffset >= zp->z_size) {
error = 0;
goto out;
}
- ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
- n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
+ ASSERT(uio->uio_loffset < zp->z_size);
+ n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
if ((uio->uio_extflg == UIO_XUIO) &&
(((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
@@ -531,8 +531,8 @@
*/
while (--nblk >= 0) {
(void) dmu_xuio_add(xuio,
- dmu_request_arcbuf(zp->z_dbuf, blksz),
- 0, blksz);
+ dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz), 0, blksz);
}
}
}
@@ -580,6 +580,7 @@
* Timestamps:
* vp - ctime|mtime updated if byte count > 0
*/
+
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
@@ -596,7 +597,6 @@
ssize_t n, nbytes;
rl_t *rl;
int max_blksz = zfsvfs->z_max_blksz;
- uint64_t pflags;
int error;
arc_buf_t *abuf;
iovec_t *aiov;
@@ -605,6 +605,9 @@
int iovcnt = uio->uio_iovcnt;
iovec_t *iovp = uio->uio_iov;
int write_eof;
+ int count = 0;
+ sa_bulk_attr_t bulk[4];
+ uint64_t mtime[2], ctime[2];
/*
* Fasttrack empty write
@@ -619,13 +622,19 @@
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+
/*
* If immutable or not appending then return EPERM
*/
- pflags = zp->z_phys->zp_flags;
- if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
- ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
- (uio->uio_loffset < zp->z_phys->zp_size))) {
+ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
+ ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
+ (uio->uio_loffset < zp->z_size))) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
@@ -635,7 +644,7 @@
/*
* Validate file offset
*/
- woff = ioflag & FAPPEND ? zp->z_phys->zp_size : uio->uio_loffset;
+ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
if (woff < 0) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
@@ -645,7 +654,7 @@
* Check for mandatory locks before calling zfs_range_lock()
* in order to prevent a deadlock with locks set via fcntl().
*/
- if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
+ if (MANDMODE((mode_t)zp->z_mode) &&
(error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
@@ -678,7 +687,7 @@
* the file block size to increase.
* Note that zp_size cannot change with this lock held.
*/
- woff = zp->z_phys->zp_size;
+ woff = zp->z_size;
}
uio->uio_loffset = woff;
} else {
@@ -700,9 +709,9 @@
n = limit - woff;
/* Will this write extend the file length? */
- write_eof = (woff + n > zp->z_phys->zp_size);
-
- end_size = MAX(zp->z_phys->zp_size, woff + n);
+ write_eof = (woff + n > zp->z_size);
+
+ end_size = MAX(zp->z_size, woff + n);
/*
* Write the file in reasonable size chunks. Each chunk is written
@@ -713,10 +722,8 @@
abuf = NULL;
woff = uio->uio_loffset;
again:
- if (zfs_usergroup_overquota(zfsvfs,
- B_FALSE, zp->z_phys->zp_uid) ||
- zfs_usergroup_overquota(zfsvfs,
- B_TRUE, zp->z_phys->zp_gid)) {
+ if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+ zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
if (abuf != NULL)
dmu_return_arcbuf(abuf);
error = EDQUOT;
@@ -735,7 +742,7 @@
aiov->iov_len == arc_buf_size(abuf)));
i_iov++;
} else if (abuf == NULL && n >= max_blksz &&
- woff >= zp->z_phys->zp_size &&
+ woff >= zp->z_size &&
P2PHASE(woff, max_blksz) == 0 &&
zp->z_blksz == max_blksz) {
/*
@@ -747,7 +754,8 @@
*/
size_t cbytes;
- abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ max_blksz);
ASSERT(abuf != NULL);
ASSERT(arc_buf_size(abuf) == max_blksz);
if (error = uiocopy(abuf->b_data, max_blksz,
@@ -762,8 +770,9 @@
* Start a transaction.
*/
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
if (error == ERESTART) {
@@ -825,7 +834,8 @@
xuio_stat_wbuf_copied();
} else {
ASSERT(xuio || tx_bytes == max_blksz);
- dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+ dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
+ woff, abuf, tx);
}
ASSERT(tx_bytes <= uio->uio_resid);
uioskip(uio, tx_bytes);
@@ -840,6 +850,8 @@
* partial progress, update the znode and ZIL accordingly.
*/
if (tx_bytes == 0) {
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
dmu_tx_commit(tx);
ASSERT(error != 0);
break;
@@ -853,33 +865,35 @@
* been done, but that would still expose the ISUID/ISGID
* to another app after the partial write is committed.
*
- * Note: we don't call zfs_fuid_map_id() here because
- * user 0 is not an ephemeral uid.
*/
mutex_enter(&zp->z_acl_lock);
- if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
+ if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
(S_IXUSR >> 6))) != 0 &&
- (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
+ (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
secpolicy_vnode_setid_retain(cr,
- (zp->z_phys->zp_mode & S_ISUID) != 0 &&
- zp->z_phys->zp_uid == 0) != 0) {
- zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
+ (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
+ uint64_t newmode;
+ zp->z_mode &= ~(S_ISUID | S_ISGID);
+ newmode = zp->z_mode;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+ (void *)&newmode, sizeof (uint64_t), tx);
}
mutex_exit(&zp->z_acl_lock);
- /*
- * Update time stamp. NOTE: This marks the bonus buffer as
- * dirty, so we don't have to do it again for zp_size.
- */
- zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+ B_TRUE);
/*
* Update the file size (zp_size) if it has changed;
* account for possible concurrent updates.
*/
- while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
- (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+ while ((end_size = zp->z_size) < uio->uio_loffset) {
+ (void) atomic_cas_64(&zp->z_size, end_size,
uio->uio_loffset);
+ ASSERT(error == 0);
+ }
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
dmu_tx_commit(tx);
@@ -983,7 +997,7 @@
if (buf != NULL) { /* immediate write */
zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
/* test for truncation needs to be done while range locked */
- if (offset >= zp->z_phys->zp_size) {
+ if (offset >= zp->z_size) {
error = ENOENT;
} else {
error = dmu_read(os, object, offset, size, buf,
@@ -1010,7 +1024,7 @@
zfs_range_unlock(zgd->zgd_rl);
}
/* test for truncation needs to be done while range locked */
- if (lr->lr_offset >= zp->z_phys->zp_size)
+ if (lr->lr_offset >= zp->z_size)
error = ENOENT;
#ifdef DEBUG
if (zil_fault_io) {
@@ -1132,7 +1146,7 @@
if (dvp->v_type != VDIR) {
return (ENOTDIR);
- } else if (zdp->z_dbuf == NULL) {
+ } else if (zdp->z_sa_hdl == NULL) {
return (EIO);
}
@@ -1184,7 +1198,7 @@
* We don't allow recursive attributes..
* Maybe someday we will.
*/
- if (zdp->z_phys->zp_flags & ZFS_XATTR) {
+ if (zdp->z_pflags & ZFS_XATTR) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
}
@@ -1277,7 +1291,7 @@
ksid_t *ksid;
uid_t uid;
gid_t gid = crgetgid(cr);
- zfs_acl_ids_t acl_ids;
+ zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
/*
@@ -1344,6 +1358,7 @@
return (error);
}
}
+
if (zp == NULL) {
uint64_t txtype;
@@ -1359,7 +1374,8 @@
* We only support the creation of regular files in
* extended attribute directories.
*/
- if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
+
+ if ((dzp->z_pflags & ZFS_XATTR) &&
(vap->va_type != VREG)) {
error = EINVAL;
goto out;
@@ -1375,15 +1391,19 @@
}
tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
- dmu_tx_hold_bonus(tx, dzp->z_id);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, SPA_MAXBLOCKSIZE);
+ 0, acl_ids.z_aclp->z_acl_bytes);
}
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
@@ -1398,13 +1418,12 @@
ZFS_EXIT(zfsvfs);
return (error);
}
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
(void) zfs_link_create(dl, zp, tx, ZNEW);
-
txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
if (flag & FIGNORECASE)
txtype |= TX_CI;
@@ -1490,6 +1509,9 @@
* dvp - ctime|mtime
* vp - ctime (if nlink > 0)
*/
+
+uint64_t null_xattr = 0;
+
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
@@ -1500,7 +1522,8 @@
vnode_t *vp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- uint64_t acl_obj, xattr_obj;
+ uint64_t acl_obj, xattr_obj = 0;
+ uint64_t xattr_obj_unlinked = 0;
zfs_dirlock_t *dl;
dmu_tx_t *tx;
boolean_t may_delete_now, delete_now = FALSE;
@@ -1566,24 +1589,29 @@
*/
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
if (may_delete_now) {
toobig =
- zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
+ zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
/* if the file is too big, only hold_free a token amount */
dmu_tx_hold_free(tx, zp->z_id, 0,
(toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
}
/* are there any extended attributes? */
- if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
- /* XXX - do we need this if we are deleting? */
- dmu_tx_hold_bonus(tx, xattr_obj);
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT3U(error, ==, 0);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
}
/* are there any additional acls */
- if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
- may_delete_now)
+ if ((acl_obj = ZFS_EXTERNAL_ACL(zp)) != 0 && may_delete_now)
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
/* charge as an update -- would be nice not to charge at all */
@@ -1616,26 +1644,37 @@
}
if (unlinked) {
+
mutex_enter(&vp->v_lock);
+
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
delete_now = may_delete_now && !toobig &&
vp->v_count == 1 && !vn_has_cached_data(vp) &&
- zp->z_phys->zp_xattr == xattr_obj &&
- zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
+ xattr_obj == xattr_obj_unlinked && ZFS_EXTERNAL_ACL(zp) ==
+ acl_obj;
mutex_exit(&vp->v_lock);
}
if (delete_now) {
- if (zp->z_phys->zp_xattr) {
- error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
- ASSERT3U(error, ==, 0);
- ASSERT3U(xzp->z_phys->zp_links, ==, 2);
- dmu_buf_will_dirty(xzp->z_dbuf, tx);
+ if (xattr_obj_unlinked) {
+ ASSERT3U(xzp->z_links, ==, 2);
mutex_enter(&xzp->z_lock);
xzp->z_unlinked = 1;
- xzp->z_phys->zp_links = 0;
+ xzp->z_links = 0;
+ error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &xzp->z_links, sizeof (xzp->z_links), tx);
+ ASSERT3U(error, ==, 0);
mutex_exit(&xzp->z_lock);
zfs_unlinked_add(xzp, tx);
- zp->z_phys->zp_xattr = 0; /* probably unnecessary */
+ if (zp->z_is_sa)
+ error = sa_remove(zp->z_sa_hdl,
+ SA_ZPL_XATTR(zfsvfs), tx);
+ else
+ error = sa_update(zp->z_sa_hdl,
+ SA_ZPL_XATTR(zfsvfs), &null_xattr,
+ sizeof (uint64_t), tx);
+ ASSERT3U(error, ==, 0);
}
mutex_enter(&zp->z_lock);
mutex_enter(&vp->v_lock);
@@ -1707,7 +1746,7 @@
ksid_t *ksid;
uid_t uid;
gid_t gid = crgetgid(cr);
- zfs_acl_ids_t acl_ids;
+ zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
ASSERT(vap->va_type == VDIR);
@@ -1731,7 +1770,7 @@
ZFS_VERIFY_ZP(dzp);
zilog = zfsvfs->z_log;
- if (dzp->z_phys->zp_flags & ZFS_XATTR) {
+ if (dzp->z_pflags & ZFS_XATTR) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
}
@@ -1791,9 +1830,14 @@
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
- if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, SPA_MAXBLOCKSIZE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
zfs_acl_ids_free(&acl_ids);
@@ -1811,10 +1855,11 @@
/*
* Create new node.
*/
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
+
/*
* Now put new name in parent dir.
*/
@@ -1829,6 +1874,7 @@
acl_ids.z_fuidp, vap);
zfs_acl_ids_free(&acl_ids);
+
dmu_tx_commit(tx);
zfs_dirent_unlock(dl);
@@ -1920,8 +1966,10 @@
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
rw_exit(&zp->z_parent_lock);
@@ -2003,6 +2051,7 @@
zap_attribute_t zap;
uint_t bytes_wanted;
uint64_t offset; /* must be unsigned; checks for < 1 */
+ uint64_t parent;
int local_eof;
int outcount;
int error;
@@ -2012,6 +2061,12 @@
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
/*
* If we are not given an eof variable,
* use a local one.
@@ -2099,7 +2154,7 @@
} else if (offset == 1) {
(void) strcpy(zap.za_name, "..");
zap.za_normalization_conflict = 0;
- objnum = zp->z_phys->zp_parent;
+ objnum = parent;
} else if (offset == 2 && zfs_show_ctldir(zp)) {
(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
zap.za_normalization_conflict = 0;
@@ -2293,24 +2348,32 @@
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_phys_t *pzp;
int error = 0;
uint64_t links;
+ uint64_t mtime[2], ctime[2];
xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
xoptattr_t *xoap = NULL;
boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- pzp = zp->z_phys;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+
+ if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
/*
* If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
* Also, if we are the owner don't bother, since owner should
* always be allowed to read basic attributes of file.
*/
- if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
- (pzp->zp_uid != crgetuid(cr))) {
+ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (zp->z_uid != crgetuid(cr))) {
if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
skipaclchk, cr)) {
ZFS_EXIT(zfsvfs);
@@ -2325,16 +2388,17 @@
mutex_enter(&zp->z_lock);
vap->va_type = vp->v_type;
- vap->va_mode = pzp->zp_mode & MODEMASK;
- zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+ vap->va_mode = zp->z_mode & MODEMASK;
+ vap->va_uid = zp->z_uid;
+ vap->va_gid = zp->z_gid;
vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
vap->va_nodeid = zp->z_id;
if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
- links = pzp->zp_links + 1;
+ links = zp->z_links + 1;
else
- links = pzp->zp_links;
+ links = zp->z_links;
vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
- vap->va_size = pzp->zp_size;
+ vap->va_size = zp->z_size;
vap->va_rdev = vp->v_rdev;
vap->va_seq = zp->z_seq;
@@ -2345,115 +2409,97 @@
if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
xoap->xoa_archive =
- ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
+ ((zp->z_pflags & ZFS_ARCHIVE) != 0);
XVA_SET_RTN(xvap, XAT_ARCHIVE);
}
if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
xoap->xoa_readonly =
- ((pzp->zp_flags & ZFS_READONLY) != 0);
+ ((zp->z_pflags & ZFS_READONLY) != 0);
XVA_SET_RTN(xvap, XAT_READONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
xoap->xoa_system =
- ((pzp->zp_flags & ZFS_SYSTEM) != 0);
+ ((zp->z_pflags & ZFS_SYSTEM) != 0);
XVA_SET_RTN(xvap, XAT_SYSTEM);
}
if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
xoap->xoa_hidden =
- ((pzp->zp_flags & ZFS_HIDDEN) != 0);
+ ((zp->z_pflags & ZFS_HIDDEN) != 0);
XVA_SET_RTN(xvap, XAT_HIDDEN);
}
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
xoap->xoa_nounlink =
- ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0);
XVA_SET_RTN(xvap, XAT_NOUNLINK);
}
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
xoap->xoa_immutable =
- ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
XVA_SET_RTN(xvap, XAT_IMMUTABLE);
}
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
xoap->xoa_appendonly =
- ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0);
XVA_SET_RTN(xvap, XAT_APPENDONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
xoap->xoa_nodump =
- ((pzp->zp_flags & ZFS_NODUMP) != 0);
+ ((zp->z_pflags & ZFS_NODUMP) != 0);
XVA_SET_RTN(xvap, XAT_NODUMP);
}
if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
xoap->xoa_opaque =
- ((pzp->zp_flags & ZFS_OPAQUE) != 0);
+ ((zp->z_pflags & ZFS_OPAQUE) != 0);
XVA_SET_RTN(xvap, XAT_OPAQUE);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
xoap->xoa_av_quarantined =
- ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
xoap->xoa_av_modified =
- ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
- vp->v_type == VREG &&
- (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
- size_t len;
- dmu_object_info_t doi;
-
- /*
- * Only VREG files have anti-virus scanstamps, so we
- * won't conflict with symlinks in the bonus buffer.
- */
- dmu_object_info_from_db(zp->z_dbuf, &doi);
- len = sizeof (xoap->xoa_av_scanstamp) +
- sizeof (znode_phys_t);
- if (len <= doi.doi_bonus_size) {
- /*
- * pzp points to the start of the
- * znode_phys_t. pzp + 1 points to the
- * first byte after the znode_phys_t.
- */
- (void) memcpy(xoap->xoa_av_scanstamp,
- pzp + 1,
- sizeof (xoap->xoa_av_scanstamp));
- XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
- }
+ vp->v_type == VREG) {
+ zfs_sa_get_scanstamp(zp, xvap);
}
if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
- ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
+ uint64_t times[2];
+
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+ times, sizeof (times));
+ ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
XVA_SET_RTN(xvap, XAT_CREATETIME);
}
if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
- xoap->xoa_reparse =
- ((pzp->zp_flags & ZFS_REPARSE) != 0);
+ xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
XVA_SET_RTN(xvap, XAT_REPARSE);
}
}
- ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
- ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
- ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
+ ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, mtime);
+ ZFS_TIME_DECODE(&vap->va_ctime, ctime);
mutex_exit(&zp->z_lock);
- dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks);
+ sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
if (zp->z_blksz == 0) {
/*
@@ -2490,7 +2536,6 @@
caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
- znode_phys_t *pzp;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
zilog_t *zilog;
dmu_tx_t *tx;
@@ -2501,15 +2546,19 @@
int trim_mask = 0;
uint64_t new_mode;
uint64_t new_uid, new_gid;
+ uint64_t xattr_obj = 0;
+ uint64_t mtime[2], ctime[2];
znode_t *attrzp;
int need_policy = FALSE;
- int err;
+ int err, err2;
zfs_fuid_info_t *fuidp = NULL;
xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
xoptattr_t *xoap;
zfs_acl_t *aclp = NULL;
boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- boolean_t fuid_dirtied = B_FALSE;
+ boolean_t fuid_dirtied = B_FALSE;
+ sa_bulk_attr_t bulk[7], xattr_bulk[7];
+ int count = 0, xattr_count = 0;
if (mask == 0)
return (0);
@@ -2520,7 +2569,6 @@
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- pzp = zp->z_phys;
zilog = zfsvfs->z_log;
/*
@@ -2557,14 +2605,14 @@
/*
* Immutable files can only alter immutable bit and atime
*/
- if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
+ if ((zp->z_pflags & ZFS_IMMUTABLE) &&
((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
- if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
+ if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
@@ -2621,9 +2669,10 @@
XVA_ISSET_REQ(xvap, XAT_READONLY) ||
XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
- XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
+ XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
skipaclchk, cr);
+ }
if (mask & (AT_UID|AT_GID)) {
int idmask = (mask & (AT_UID|AT_GID));
@@ -2636,7 +2685,7 @@
*/
if (!(mask & AT_MODE))
- vap->va_mode = pzp->zp_mode;
+ vap->va_mode = zp->z_mode;
/*
* Take ownership or chgrp to group we are a member of
@@ -2674,8 +2723,9 @@
}
mutex_enter(&zp->z_lock);
- oldva.va_mode = pzp->zp_mode;
- zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
+ oldva.va_mode = zp->z_mode;
+ oldva.va_uid = zp->z_uid;
+ oldva.va_gid = zp->z_gid;
if (mask & AT_XVATTR) {
/*
* Update xvattr mask to include only those attributes
@@ -2686,7 +2736,7 @@
*/
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
if (xoap->xoa_appendonly !=
- ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_APPENDONLY);
@@ -2696,7 +2746,7 @@
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
if (xoap->xoa_nounlink !=
- ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_NOUNLINK);
@@ -2706,7 +2756,7 @@
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
if (xoap->xoa_immutable !=
- ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
@@ -2716,7 +2766,7 @@
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
if (xoap->xoa_nodump !=
- ((pzp->zp_flags & ZFS_NODUMP) != 0)) {
+ ((zp->z_pflags & ZFS_NODUMP) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_NODUMP);
@@ -2726,7 +2776,7 @@
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
if (xoap->xoa_av_modified !=
- ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
@@ -2738,7 +2788,7 @@
if ((vp->v_type != VREG &&
xoap->xoa_av_quarantined) ||
xoap->xoa_av_quarantined !=
- ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
need_policy = TRUE;
} else {
XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
@@ -2805,78 +2855,83 @@
*/
mask = vap->va_mask;
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
-
- if (mask & AT_MODE) {
- uint64_t pmode = pzp->zp_mode;
-
- new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
-
- if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
- goto out;
- if (pzp->zp_acl.z_acl_extern_obj) {
- /* Are we upgrading ACL from old V0 format to new V1 */
- if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
- pzp->zp_acl.z_acl_version ==
- ZFS_ACL_VERSION_INITIAL) {
- dmu_tx_hold_free(tx,
- pzp->zp_acl.z_acl_extern_obj, 0,
- DMU_OBJECT_END);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
- } else {
- dmu_tx_hold_write(tx,
- pzp->zp_acl.z_acl_extern_obj, 0,
- aclp->z_acl_bytes);
- }
- } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, aclp->z_acl_bytes);
- }
- }
-
- if (mask & (AT_UID | AT_GID)) {
- if (pzp->zp_xattr) {
- err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
+ if ((mask & (AT_UID | AT_GID))) {
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj,
+ sizeof (xattr_obj));
+
+ if (xattr_obj) {
+ err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
if (err)
- goto out;
- dmu_tx_hold_bonus(tx, attrzp->z_id);
+ goto out2;
}
if (mask & AT_UID) {
new_uid = zfs_fuid_create(zfsvfs,
(uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
- if (new_uid != pzp->zp_uid &&
- zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) {
+ if (vap->va_uid != zp->z_uid &&
+ zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
err = EDQUOT;
- goto out;
+ goto out2;
}
}
if (mask & AT_GID) {
new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
cr, ZFS_GROUP, &fuidp);
- if (new_gid != pzp->zp_gid &&
- zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) {
+ if (new_gid != zp->z_gid &&
+ zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
err = EDQUOT;
- goto out;
- }
- }
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied) {
- if (zfsvfs->z_fuid_obj == 0) {
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
- dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
- FALSE, NULL);
- } else {
- dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
- dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
- FUID_SIZE_ESTIMATE(zfsvfs));
+ goto out2;
}
}
}
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ if (mask & AT_MODE) {
+ uint64_t pmode = zp->z_mode;
+ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
+ goto out;
+
+ if (!zp->z_is_sa && ZFS_EXTERNAL_ACL(zp)) {
+ /*
+ * Are we upgrading ACL from old V0 format
+ * to V1 format?
+ */
+ if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
+ ZNODE_ACL_VERSION(zp) ==
+ ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx,
+ ZFS_EXTERNAL_ACL(zp), 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, ZFS_EXTERNAL_ACL(zp), 0,
+ aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ }
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ } else {
+ if ((mask & AT_XVATTR) &&
+ XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ }
+
+ if (attrzp) {
+ dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
+ }
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ zfs_sa_upgrade_txholds(tx, zp);
err = dmu_tx_assign(tx, TXG_NOWAIT);
if (err) {
@@ -2885,8 +2940,7 @@
goto out;
}
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
+ count = 0;
/*
* Set each attribute requested.
* We group settings according to the locks they need to acquire.
@@ -2897,9 +2951,38 @@
mutex_enter(&zp->z_lock);
+ if (attrzp)
+ mutex_enter(&attrzp->z_lock);
+
+ if (mask & AT_UID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &new_uid, sizeof (new_uid));
+ zp->z_uid = zfs_fuid_map_id(zfsvfs, new_uid, cr, ZFS_OWNER);
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+ sizeof (new_uid));
+ attrzp->z_gid = zp->z_uid;
+ }
+ }
+
+ if (mask & AT_GID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &new_gid, sizeof (new_gid));
+ zp->z_gid = zfs_fuid_map_id(zfsvfs, new_gid, cr, ZFS_GROUP);
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+ sizeof (new_gid));
+ attrzp->z_gid = zp->z_gid;
+ }
+ }
+
if (mask & AT_MODE) {
mutex_enter(&zp->z_acl_lock);
- zp->z_phys->zp_mode = new_mode;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &new_mode, sizeof (new_mode));
+ zp->z_mode = new_mode;
err = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT3U(err, ==, 0);
zp->z_acl_cached = aclp;
@@ -2908,34 +2991,42 @@
}
if (attrzp)
- mutex_enter(&attrzp->z_lock);
-
- if (mask & AT_UID) {
- pzp->zp_uid = new_uid;
- if (attrzp)
- attrzp->z_phys->zp_uid = new_uid;
+ mutex_exit(&attrzp->z_lock);
+
+ if (mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
}
- if (mask & AT_GID) {
- pzp->zp_gid = new_gid;
- if (attrzp)
- attrzp->z_phys->zp_gid = new_gid;
+ if (mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
}
- if (attrzp)
- mutex_exit(&attrzp->z_lock);
-
- if (mask & AT_ATIME)
- ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
-
- if (mask & AT_MTIME)
- ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
-
/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
- if (mask & AT_SIZE)
- zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
- else if (mask != 0)
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ if (mask & AT_SIZE && !(mask & AT_MTIME)) {
+ if (!(mask & AT_MTIME))
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+ B_TRUE);
+ } else if (mask != 0) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
+ B_TRUE);
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
+ mtime, ctime, B_TRUE);
+ }
+ }
/*
* Do this after setting timestamps to prevent timestamp
* update from toggling bit
@@ -2967,20 +3058,12 @@
XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
}
- if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
- size_t len;
- dmu_object_info_t doi;
-
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
ASSERT(vp->v_type == VREG);
- /* Grow the bonus buffer if necessary. */
- dmu_object_info_from_db(zp->z_dbuf, &doi);
- len = sizeof (xoap->xoa_av_scanstamp) +
- sizeof (znode_phys_t);
- if (len > doi.doi_bonus_size)
- VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
- }
- zfs_xvattr_set(zp, xvap);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ zfs_xvattr_set(zp, xvap, tx);
}
if (fuid_dirtied)
@@ -2992,9 +3075,14 @@
mutex_exit(&zp->z_lock);
out:
+ if (err == 0 && attrzp) {
+ err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+ xattr_count, tx);
+ ASSERT(err2 == 0);
+ }
+
if (attrzp)
VN_RELE(ZTOV(attrzp));
-
if (aclp)
zfs_acl_free(aclp);
@@ -3003,14 +3091,17 @@
fuidp = NULL;
}
- if (err)
+ if (err) {
dmu_tx_abort(tx);
- else
+ if (err == ERESTART)
+ goto top;
+ } else {
+ err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
dmu_tx_commit(tx);
-
- if (err == ERESTART)
- goto top;
-
+ }
+
+
+out2:
ZFS_EXIT(zfsvfs);
return (err);
}
@@ -3050,7 +3141,7 @@
zfs_zlock_t *zl;
znode_t *zp = tdzp;
uint64_t rootid = zp->z_zfsvfs->z_root;
- uint64_t *oidp = &zp->z_id;
+ uint64_t oidp = zp->z_id;
krwlock_t *rwlp = &szp->z_parent_lock;
krw_t rw = RW_WRITER;
@@ -3072,7 +3163,7 @@
zfs_rename_unlock(&zl);
*zlpp = NULL;
zp = tdzp;
- oidp = &zp->z_id;
+ oidp = zp->z_id;
rwlp = &szp->z_parent_lock;
rw = RW_WRITER;
continue;
@@ -3090,19 +3181,20 @@
zl->zl_next = *zlpp;
*zlpp = zl;
- if (*oidp == szp->z_id) /* We're a descendant of szp */
+ if (oidp == szp->z_id) /* We're a descendant of szp */
return (EINVAL);
- if (*oidp == rootid) /* We've hit the top */
+ if (oidp == rootid) /* We've hit the top */
return (0);
if (rw == RW_READER) { /* i.e. not the first pass */
- int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
+ int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
if (error)
return (error);
zl->zl_znode = zp;
}
- oidp = &zp->z_phys->zp_parent;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
+ &oidp, sizeof (oidp));
rwlp = &zp->z_parent_lock;
rw = RW_READER;
@@ -3182,8 +3274,7 @@
* by renaming a linked file into/outof an attribute directory.
* See the comment in zfs_link() for why this is considered bad.
*/
- if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
- (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
}
@@ -3363,14 +3454,20 @@
}
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
- dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
- if (sdzp != tdzp)
- dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
- if (tzp)
- dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
+ if (sdzp != tdzp) {
+ dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ }
+ if (tzp) {
+ dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tzp);
+ }
+
+ zfs_sa_upgrade_txholds(tx, szp);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
@@ -3401,10 +3498,14 @@
if (error == 0) {
error = zfs_link_create(tdl, szp, tx, ZRENAMING);
if (error == 0) {
- szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
+ szp->z_pflags |= ZFS_AV_MODIFIED;
+
+ error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+ ASSERT3U(error, ==, 0);
error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
- ASSERT(error == 0);
+ ASSERT3U(error, ==, 0);
zfs_log_rename(zilog, tx,
TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
@@ -3462,11 +3563,12 @@
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- int len = strlen(link);
+ uint64_t len = strlen(link);
int error;
int zflg = ZNEW;
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
+ uint64_t txtype = TX_SYMLINK;
ASSERT(vap->va_type == VLNK);
@@ -3511,10 +3613,14 @@
tx = dmu_tx_create(zfsvfs->z_os);
fuid_dirtied = zfsvfs->z_fuid_dirty;
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
- dmu_tx_hold_bonus(tx, dzp->z_id);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE + len);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
error = dmu_tx_assign(tx, TXG_NOWAIT);
@@ -3531,50 +3637,33 @@
return (error);
}
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
-
/*
* Create a new object for the symlink.
- * Put the link content into bonus buffer if it will fit;
- * otherwise, store it just like any other file data.
+ * for version 4 ZPL datsets the symlink will be an SA attribute
*/
- if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids);
- if (len != 0)
- bcopy(link, zp->z_phys + 1, len);
- } else {
- dmu_buf_t *dbp;
-
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
- /*
- * Nothing can access the znode yet so no locking needed
- * for growing the znode's blocksize.
- */
- zfs_grow_blocksize(zp, len, tx);
-
- VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
- zp->z_id, 0, FTAG, &dbp));
- dmu_buf_will_dirty(dbp, tx);
-
- ASSERT3U(len, <=, dbp->db_size);
- bcopy(link, dbp->db_data, len);
- dmu_buf_rele(dbp, FTAG);
- }
- zp->z_phys->zp_size = len;
-
+
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ if (zp->z_is_sa)
+ error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+ link, len, tx);
+ else
+ zfs_sa_symlink(zp, link, len, tx);
+
+ zp->z_size = len;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx);
/*
* Insert the new object into the directory.
*/
(void) zfs_link_create(dl, zp, tx, ZNEW);
- if (error == 0) {
- uint64_t txtype = TX_SYMLINK;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
- }
+
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
zfs_acl_ids_free(&acl_ids);
@@ -3611,29 +3700,19 @@
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- size_t bufsz;
int error;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- bufsz = (size_t)zp->z_phys->zp_size;
- if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
- error = uiomove(zp->z_phys + 1,
- MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- } else {
- dmu_buf_t *dbp;
- error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
- if (error) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- error = uiomove(dbp->db_data,
- MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- dmu_buf_rele(dbp, FTAG);
- }
+ if (zp->z_is_sa)
+ error = sa_lookup_uio(zp->z_sa_hdl,
+ SA_ZPL_SYMLINK(zfsvfs), uio);
+ else
+ error = zfs_sa_readlink(zp, uio);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -3668,7 +3747,6 @@
vnode_t *realvp;
int error;
int zf = ZNEW;
- uid_t owner;
ASSERT(tdvp->v_type == VDIR);
@@ -3701,8 +3779,7 @@
* into "normal" file space in order to circumvent restrictions
* imposed in attribute space.
*/
- if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
- (dzp->z_phys->zp_flags & ZFS_XATTR)) {
+ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
ZFS_EXIT(zfsvfs);
return (EINVAL);
}
@@ -3716,8 +3793,7 @@
return (EPERM);
}
- owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
- if (owner != crgetuid(cr) &&
+ if (szp->z_uid != crgetuid(cr) &&
secpolicy_basic_link(cr) != 0) {
ZFS_EXIT(zfsvfs);
return (EPERM);
@@ -3738,8 +3814,10 @@
}
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, szp->z_id);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ zfs_sa_upgrade_txholds(tx, szp);
+ zfs_sa_upgrade_txholds(tx, dzp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
zfs_dirent_unlock(dl);
@@ -3815,10 +3893,8 @@
dmu_tx_t *tx;
u_offset_t off, koff;
size_t len, klen;
- uint64_t filesz;
int err;
- filesz = zp->z_phys->zp_size;
off = pp->p_offset;
len = PAGESIZE;
/*
@@ -3826,12 +3902,12 @@
* multiple pages so that we write a full block (thus avoiding
* a read-modify-write).
*/
- if (off < filesz && zp->z_blksz > PAGESIZE) {
+ if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
- ASSERT(koff <= filesz);
- if (koff + klen > filesz)
- klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE);
+ ASSERT(koff <= zp->z_size);
+ if (koff + klen > zp->z_size)
+ klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
}
ASSERT3U(btop(len), ==, btopr(len));
@@ -3839,30 +3915,32 @@
/*
* Can't push pages past end-of-file.
*/
- if (off >= filesz) {
+ if (off >= zp->z_size) {
/* ignore all pages */
err = 0;
goto out;
- } else if (off + len > filesz) {
- int npages = btopr(filesz - off);
+ } else if (off + len > zp->z_size) {
+ int npages = btopr(zp->z_size - off);
page_t *trunc;
page_list_break(&pp, &trunc, npages);
/* ignore pages past end of file */
if (trunc)
pvn_write_done(trunc, flags);
- len = filesz - off;
+ len = zp->z_size - off;
}
- if (zfs_usergroup_overquota(zfsvfs, B_FALSE, zp->z_phys->zp_uid) ||
- zfs_usergroup_overquota(zfsvfs, B_TRUE, zp->z_phys->zp_gid)) {
+ if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+ zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
err = EDQUOT;
goto out;
}
top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_write(tx, zp->z_id, off, len);
- dmu_tx_hold_bonus(tx, zp->z_id);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
err = dmu_tx_assign(tx, TXG_NOWAIT);
if (err != 0) {
if (err == ERESTART) {
@@ -3884,7 +3962,16 @@
}
if (err == 0) {
- zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+ B_TRUE);
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
}
dmu_tx_commit(tx);
@@ -3960,14 +4047,14 @@
}
rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
- if (off > zp->z_phys->zp_size) {
+ if (off > zp->z_size) {
/* past end of file */
zfs_range_unlock(rl);
ZFS_EXIT(zfsvfs);
return (0);
}
- len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off);
+ len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
for (off = io_off; io_off < off + len; io_off += io_len) {
if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
@@ -4008,7 +4095,7 @@
int error;
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
- if (zp->z_dbuf == NULL) {
+ if (zp->z_sa_hdl == NULL) {
/*
* The fs has been unmounted, or we did a
* suspend/resume and this file no longer exists.
@@ -4041,13 +4128,15 @@
if (zp->z_atime_dirty && zp->z_unlinked == 0) {
dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
} else {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
mutex_enter(&zp->z_lock);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+ (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
zp->z_atime_dirty = 0;
mutex_exit(&zp->z_lock);
dmu_tx_commit(tx);
@@ -4099,7 +4188,7 @@
* return an error, but we don't worry about races between this
* function and zfs_map().
*/
- if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+ if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
ZFS_EXIT(zfsvfs);
return (EAGAIN);
}
@@ -4312,15 +4401,14 @@
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- if ((prot & PROT_WRITE) &&
- (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_READONLY |
- ZFS_APPENDONLY))) {
+ if ((prot & PROT_WRITE) && (zp->z_pflags &
+ (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
ZFS_EXIT(zfsvfs);
return (EPERM);
}
if ((prot & (PROT_READ | PROT_EXEC)) &&
- (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED)) {
+ (zp->z_pflags & ZFS_AV_QUARANTINED)) {
ZFS_EXIT(zfsvfs);
return (EACCES);
}
@@ -4343,7 +4431,7 @@
/*
* If file is locked, disallow mapping.
*/
- if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) {
+ if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
ZFS_EXIT(zfsvfs);
return (EAGAIN);
}
@@ -4489,13 +4577,19 @@
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
uint32_t gen;
+ uint64_t gen64;
uint64_t object = zp->z_id;
zfid_short_t *zfid;
- int size, i;
+ int size, i, error;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- gen = (uint32_t)zp->z_gen;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+ &gen64, sizeof (uint64_t))) != 0)
+ return (error);
+
+ gen = (uint32_t)gen64;
size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
if (fidp->fid_len < size) {
@@ -4713,21 +4807,24 @@
*/
if (preamble) {
/* data begins in the middle of the arc_buf */
- abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz);
ASSERT(abuf);
(void) dmu_xuio_add(xuio, abuf,
blksz - preamble, preamble);
}
for (i = 0; i < fullblk; i++) {
- abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz);
ASSERT(abuf);
(void) dmu_xuio_add(xuio, abuf, 0, blksz);
}
if (postamble) {
/* data ends in the middle of the arc_buf */
- abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz);
ASSERT(abuf);
(void) dmu_xuio_add(xuio, abuf, 0, postamble);
}
@@ -4749,7 +4846,7 @@
return (EINVAL);
}
- maxsize = zp->z_phys->zp_size - uio->uio_loffset;
+ maxsize = zp->z_size - uio->uio_loffset;
if (size > maxsize)
size = maxsize;
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c Tue Mar 16 09:43:38 2010 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -52,6 +52,7 @@
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
+#include <sys/dnode.h>
#include <sys/fs/zfs.h>
#include <sys/kidmap.h>
#endif /* _KERNEL */
@@ -61,8 +62,11 @@
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
#include "zfs_prop.h"
+#include "zfs_comutil.h"
/*
* Define ZNODE_STATS to turn on statistic gathering. By default, it is only
@@ -131,7 +135,6 @@
avl_create(&zp->z_range_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
- zp->z_dbuf = NULL;
zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL;
return (0);
@@ -154,7 +157,6 @@
avl_destroy(&zp->z_range_avl);
mutex_destroy(&zp->z_range_lock);
- ASSERT(zp->z_dbuf == NULL);
ASSERT(zp->z_dirlocks == NULL);
ASSERT(zp->z_acl_cached == NULL);
}
@@ -198,8 +200,15 @@
nzp->z_last_itx = ozp->z_last_itx;
nzp->z_gen = ozp->z_gen;
nzp->z_sync_cnt = ozp->z_sync_cnt;
- nzp->z_phys = ozp->z_phys;
- nzp->z_dbuf = ozp->z_dbuf;
+ nzp->z_is_sa = ozp->z_is_sa;
+ nzp->z_sa_hdl = ozp->z_sa_hdl;
+ bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
+ nzp->z_links = ozp->z_links;
+ nzp->z_size = ozp->z_size;
+ nzp->z_pflags = ozp->z_pflags;
+ nzp->z_uid = ozp->z_uid;
+ nzp->z_gid = ozp->z_gid;
+ nzp->z_mode = ozp->z_mode;
/*
* Since this is just an idle znode and kmem is already dealing with
@@ -210,9 +219,7 @@
ozp->z_acl_cached = NULL;
}
- /* Update back pointers. */
- (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
- znode_evict_error);
+ sa_set_userp(nzp->z_sa_hdl, nzp);
/*
* Invalidate the original znode by clearing fields that provide a
@@ -220,7 +227,7 @@
* ensure that zfs_znode_move() recognizes the znode as invalid in any
* subsequent callback.
*/
- ozp->z_dbuf = NULL;
+ ozp->z_sa_hdl = NULL;
POINTER_INVALIDATE(&ozp->z_zfsvfs);
}
@@ -475,6 +482,7 @@
sharezp->z_unlinked = 0;
sharezp->z_atime_dirty = 0;
sharezp->z_zfsvfs = zfsvfs;
+ sharezp->z_is_sa = zfsvfs->z_use_sa;
vp = ZTOV(sharezp);
vn_reinit(vp);
@@ -482,8 +490,7 @@
VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
kcred, NULL, &acl_ids));
- zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
- &zp, 0, &acl_ids);
+ zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
ASSERT3P(zp, ==, sharezp);
ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
POINTER_INVALIDATE(&sharezp->z_zfsvfs);
@@ -493,8 +500,7 @@
zfs_acl_ids_free(&acl_ids);
ZTOV(sharezp)->v_count = 0;
- dmu_buf_rele(sharezp->z_dbuf, NULL);
- sharezp->z_dbuf = NULL;
+ sa_handle_destroy(sharezp->z_sa_hdl);
kmem_cache_free(znode_cache, sharezp);
return (error);
@@ -558,26 +564,25 @@
}
static void
-zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+ dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
{
- znode_t *nzp;
-
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
mutex_enter(&zp->z_lock);
- ASSERT(zp->z_dbuf == NULL);
+ ASSERT(zp->z_sa_hdl == NULL);
ASSERT(zp->z_acl_cached == NULL);
- zp->z_dbuf = db;
- nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
+ if (sa_hdl == NULL) {
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+ SA_HDL_SHARED, &zp->z_sa_hdl));
+ } else {
+ zp->z_sa_hdl = sa_hdl;
+ sa_set_userp(sa_hdl, zp);
+ }
- /*
- * there should be no
- * concurrent zgets on this object.
- */
- if (nzp != NULL)
- panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);
+ zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
/*
* Slap on VROOT if we are the root znode
@@ -592,14 +597,12 @@
void
zfs_znode_dmu_fini(znode_t *zp)
{
- dmu_buf_t *db = zp->z_dbuf;
ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
zp->z_unlinked ||
RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
- ASSERT(zp->z_dbuf != NULL);
- zp->z_dbuf = NULL;
- VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
- dmu_buf_rele(db, NULL);
+
+ sa_handle_destroy(zp->z_sa_hdl);
+ zp->z_sa_hdl = NULL;
}
/*
@@ -610,22 +613,27 @@
* return the znode
*/
static znode_t *
-zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+ dmu_object_type_t obj_type, sa_handle_t *hdl)
{
znode_t *zp;
vnode_t *vp;
+ uint64_t mode;
+ uint64_t parent;
+ uint64_t uid, gid;
+ sa_bulk_attr_t bulk[9];
+ int count = 0;
zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
ASSERT(zp->z_dirlocks == NULL);
- ASSERT(zp->z_dbuf == NULL);
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
/*
* Defer setting z_zfsvfs until the znode is ready to be a candidate for
* the zfs_znode_move() callback.
*/
- zp->z_phys = NULL;
+ zp->z_sa_hdl = NULL;
zp->z_unlinked = 0;
zp->z_atime_dirty = 0;
zp->z_mapcnt = 0;
@@ -638,16 +646,41 @@
vp = ZTOV(zp);
vn_reinit(vp);
- zfs_znode_dmu_init(zfsvfs, zp, db);
+ zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
- zp->z_gen = zp->z_phys->zp_gen;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &gid, 8);
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
+ if (hdl == NULL)
+ sa_handle_destroy(zp->z_sa_hdl);
+ kmem_cache_free(znode_cache, zp);
+ return (NULL);
+ }
+
+ zp->z_uid = zfs_fuid_map_id(zfsvfs, uid, CRED(), ZFS_OWNER);
+ zp->z_gid = zfs_fuid_map_id(zfsvfs, gid, CRED(), ZFS_GROUP);
+ zp->z_mode = mode;
vp->v_vfsp = zfsvfs->z_parent->z_vfs;
- vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
+
+ vp->v_type = IFTOVT((mode_t)mode);
switch (vp->v_type) {
case VDIR:
- if (zp->z_phys->zp_flags & ZFS_XATTR) {
+ if (zp->z_pflags & ZFS_XATTR) {
vn_setops(vp, zfs_xdvnodeops);
vp->v_flag |= V_XATTRDIR;
} else {
@@ -657,7 +690,13 @@
break;
case VBLK:
case VCHR:
- vp->v_rdev = zfs_cmpldev(zp->z_phys->zp_rdev);
+ {
+ uint64_t rdev;
+ VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
+ &rdev, sizeof (rdev)) == 0);
+
+ vp->v_rdev = zfs_cmpldev(rdev);
+ }
/*FALLTHROUGH*/
case VFIFO:
case VSOCK:
@@ -666,10 +705,12 @@
break;
case VREG:
vp->v_flag |= VMODSORT;
- if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir)
+ if (parent == zfsvfs->z_shares_dir) {
+ ASSERT(uid == 0 && gid == 0);
vn_setops(vp, zfs_sharevnodeops);
- else
+ } else {
vn_setops(vp, zfs_fvnodeops);
+ }
break;
case VLNK:
vn_setops(vp, zfs_symvnodeops);
@@ -693,6 +734,9 @@
return (zp);
}
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
/*
* Create a new DMU object to hold a zfs znode.
*
@@ -712,14 +756,23 @@
*/
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
- uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
+ uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
+ uint64_t crtime[2], atime[2], mtime[2], ctime[2];
+ uint64_t mode, size, links, parent, pflags;
+ uint64_t dzp_pflags = 0;
+ uint64_t rdev = 0;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
dmu_buf_t *db;
- znode_phys_t *pzp;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
timestruc_t now;
uint64_t gen, obj;
int err;
+ int bonuslen;
+ sa_handle_t *sa_hdl;
+ dmu_object_type_t obj_type;
+ sa_bulk_attr_t sa_attrs[ZPL_END];
+ int cnt = 0;
+ zfs_acl_locator_cb_t locate = { 0 };
ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
@@ -733,6 +786,10 @@
gen = dmu_tx_get_txg(tx);
}
+ obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+ bonuslen = (obj_type == DMU_OT_SA) ?
+ DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE;
+
/*
* Create a new DMU object.
*/
@@ -746,106 +803,211 @@
if (zfsvfs->z_replay) {
err = zap_create_claim_norm(zfsvfs->z_os, obj,
zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ obj_type, bonuslen, tx);
ASSERT3U(err, ==, 0);
} else {
obj = zap_create_norm(zfsvfs->z_os,
zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ obj_type, bonuslen, tx);
}
} else {
if (zfsvfs->z_replay) {
err = dmu_object_claim(zfsvfs->z_os, obj,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ obj_type, bonuslen, tx);
ASSERT3U(err, ==, 0);
} else {
obj = dmu_object_alloc(zfsvfs->z_os,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ obj_type, bonuslen, tx);
}
}
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
- VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
- dmu_buf_will_dirty(db, tx);
-
- /*
- * Initialize the znode physical data to zero.
- */
- ASSERT(db->db_size >= sizeof (znode_phys_t));
- bzero(db->db_data, db->db_size);
- pzp = db->db_data;
+ VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
/*
* If this is the root, fix up the half-initialized parent pointer
* to reference the just-allocated physical data area.
*/
if (flag & IS_ROOT_NODE) {
- dzp->z_dbuf = db;
- dzp->z_phys = pzp;
dzp->z_id = obj;
+ } else {
+ dzp_pflags = dzp->z_pflags;
}
/*
* If parent is an xattr, so am I.
*/
- if (dzp->z_phys->zp_flags & ZFS_XATTR)
+ if (dzp_pflags & ZFS_XATTR) {
flag |= IS_XATTR;
-
- if (vap->va_type == VBLK || vap->va_type == VCHR) {
- pzp->zp_rdev = zfs_expldev(vap->va_rdev);
}
if (zfsvfs->z_use_fuids)
- pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+ pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+ else
+ pflags = 0;
if (vap->va_type == VDIR) {
- pzp->zp_size = 2; /* contents ("." and "..") */
- pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ size = 2; /* contents ("." and "..") */
+ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ } else {
+ size = links = 0;
+ }
+
+ if (vap->va_type == VBLK || vap->va_type == VCHR) {
+ rdev = zfs_expldev(vap->va_rdev);
}
- pzp->zp_parent = dzp->z_id;
+ parent = dzp->z_id;
+ mode = acl_ids->z_mode;
if (flag & IS_XATTR)
- pzp->zp_flags |= ZFS_XATTR;
+ pflags |= ZFS_XATTR;
- pzp->zp_gen = gen;
+ /*
+ * No execs denied will be deterimed when zfs_mode_compute() is called.
+ */
+ pflags |= acl_ids->z_aclp->z_hints &
+ (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+ ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
- ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
- ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
+ ZFS_TIME_ENCODE(&now, crtime);
+ ZFS_TIME_ENCODE(&now, ctime);
if (vap->va_mask & AT_ATIME) {
- ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+ ZFS_TIME_ENCODE(&vap->va_atime, atime);
} else {
- ZFS_TIME_ENCODE(&now, pzp->zp_atime);
+ ZFS_TIME_ENCODE(&now, atime);
}
if (vap->va_mask & AT_MTIME) {
- ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
} else {
- ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
+ ZFS_TIME_ENCODE(&now, mtime);
}
- pzp->zp_uid = acl_ids->z_fuid;
- pzp->zp_gid = acl_ids->z_fgid;
- pzp->zp_mode = acl_ids->z_mode;
+
+ /* Now add in all of the "SA" attributes */
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+ &sa_hdl));
+
+ /*
+ * Setup the array of attributes to be replaced/set on the new file
+ *
+ * order for DMU_OT_ZNODE is critical since it needs to be constructed
+ * in the old znode_phys_t format. Don't change this ordering
+ */
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ } else {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+ &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+ &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ }
+
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+ &empty_xattr, 8);
+ }
+ if (obj_type == DMU_OT_ZNODE ||
+ (vap->va_type == VBLK || vap->va_type == VCHR)) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+ NULL, &rdev, 8);
+
+ }
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+ &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+ &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+ sizeof (uint64_t) * 4);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (zfs_acl_phys_t));
+ } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &acl_ids->z_aclp->z_acl_count, 8);
+ locate.cb_aclp = acl_ids->z_aclp;
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate,
+ acl_ids->z_aclp->z_acl_bytes);
+ mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags);
+ }
+
+ VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
if (!(flag & IS_ROOT_NODE)) {
- *zpp = zfs_znode_alloc(zfsvfs, db, 0);
+ *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+ ASSERT(*zpp != NULL);
} else {
/*
* If we are creating the root node, the "parent" we
* passed in is the znode for the root.
*/
*zpp = dzp;
+
+ (*zpp)->z_sa_hdl = sa_hdl;
}
- VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+
if (vap->va_mask & AT_XVATTR)
- zfs_xvattr_set(*zpp, (xvattr_t *)vap);
+ zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
+
+ (*zpp)->z_pflags = pflags;
+ (*zpp)->z_mode = mode;
+ if (obj_type == DMU_OT_ZNODE ||
+ acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+ err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx);
+ ASSERT3P(err, ==, 0);
+ }
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
}
+/*
+ * zfs_xvattr_set only updates the in-core attributes
+ * it is assumed the caller will be doing an sa_bulk_update
+ * to push the changes out
+ */
void
-zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
{
xoptattr_t *xoap;
@@ -853,62 +1015,74 @@
ASSERT(xoap);
if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
- ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
+ uint64_t times[2];
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ ×, sizeof (times), tx);
XVA_SET_RTN(xvap, XAT_CREATETIME);
}
if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
- ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
+ ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_READONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
- ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
+ ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_HIDDEN);
}
if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
- ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
+ ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_SYSTEM);
}
if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
- ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
+ ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_ARCHIVE);
}
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
- ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
+ ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_IMMUTABLE);
}
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
- ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
+ ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_NOUNLINK);
}
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
- ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
+ ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_APPENDONLY);
}
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
- ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
+ ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_NODUMP);
}
if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
- ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
+ ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_OPAQUE);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
- xoap->xoa_av_quarantined);
+ xoap->xoa_av_quarantined, zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
- ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
+ ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
}
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
- (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
- sizeof (xoap->xoa_av_scanstamp));
- zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
+ zfs_sa_set_scanstamp(zp, xvap, tx);
XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
}
if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
- ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse);
+ ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+ zp->z_pflags, tx);
XVA_SET_RTN(xvap, XAT_REPARSE);
}
}
@@ -920,35 +1094,42 @@
dmu_buf_t *db;
znode_t *zp;
int err;
+ sa_handle_t *hdl;
*zpp = NULL;
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
- err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
if (err) {
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (err);
}
dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_ZNODE ||
- doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db, NULL);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EINVAL);
}
- zp = dmu_buf_get_user(db);
- if (zp != NULL) {
- mutex_enter(&zp->z_lock);
+ hdl = dmu_buf_get_user(db);
+ if (hdl != NULL) {
+ zp = sa_get_userdata(hdl);
+
/*
- * Since we do immediate eviction of the z_dbuf, we
- * should never find a dbuf with a znode that doesn't
- * know about the dbuf.
+ * Since "SA" does immediate eviction we
+ * should never find a sa handle that doesn't
+ * know about the znode.
*/
- ASSERT3P(zp->z_dbuf, ==, db);
+
+ ASSERT3P(zp, !=, NULL);
+
+ mutex_enter(&zp->z_lock);
ASSERT3U(zp->z_id, ==, obj_num);
if (zp->z_unlinked) {
err = ENOENT;
@@ -957,7 +1138,7 @@
*zpp = zp;
err = 0;
}
- dmu_buf_rele(db, NULL);
+ sa_buf_rele(db, NULL);
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (err);
@@ -969,18 +1150,17 @@
*
* There is a small window where zfs_vget() could
* find this object while a file create is still in
- * progress. Since a gen number can never be zero
- * we will check that to determine if its an allocated
- * file.
+ * progress. This is checked for in zfs_znode_alloc()
+ *
+ * if zfs_znode_alloc() fails it will drop the hold on the
+ * bonus buffer.
*/
-
- if (((znode_phys_t *)db->db_data)->zp_gen != 0) {
- zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
+ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+ doi.doi_bonus_type, NULL);
+ if (zp == NULL) {
+ err = ENOENT;
+ } else {
*zpp = zp;
- err = 0;
- } else {
- dmu_buf_rele(db, NULL);
- err = ENOENT;
}
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (err);
@@ -993,39 +1173,76 @@
dmu_object_info_t doi;
dmu_buf_t *db;
uint64_t obj_num = zp->z_id;
+ uint64_t mode;
+ uint64_t uid, gid;
+ sa_bulk_attr_t bulk[8];
int err;
+ int count = 0;
+ uint64_t gen;
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
- err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ mutex_enter(&zp->z_acl_lock);
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT(zp->z_sa_hdl == NULL);
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
if (err) {
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (err);
}
dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_ZNODE ||
- doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db, NULL);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EINVAL);
}
- if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
- dmu_buf_rele(db, NULL);
+ zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+
+ /* reload cached values */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, sizeof (gen));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, sizeof (zp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &uid, sizeof (uid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &gid, sizeof (gid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+
+ zp->z_mode = mode;
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+ zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EIO);
}
- mutex_enter(&zp->z_acl_lock);
- if (zp->z_acl_cached) {
- zfs_acl_free(zp->z_acl_cached);
- zp->z_acl_cached = NULL;
+ if (gen != zp->z_gen) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (EIO);
}
- mutex_exit(&zp->z_acl_lock);
- zfs_znode_dmu_init(zfsvfs, zp, db);
- zp->z_unlinked = (zp->z_phys->zp_links == 0);
+ zp->z_uid = zfs_fuid_map_id(zfsvfs, uid, CRED(), ZFS_OWNER);
+ zp->z_gid = zfs_fuid_map_id(zfsvfs, gid, CRED(), ZFS_GROUP);
+ zp->z_unlinked = (zp->z_links == 0);
zp->z_blksz = doi.doi_data_block_size;
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
@@ -1039,7 +1256,7 @@
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
objset_t *os = zfsvfs->z_os;
uint64_t obj = zp->z_id;
- uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+ uint64_t acl_obj = ZFS_EXTERNAL_ACL(zp);
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
if (acl_obj)
@@ -1057,7 +1274,7 @@
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
uint64_t z_id = zp->z_id;
- ASSERT(zp->z_dbuf && zp->z_phys);
+ ASSERT(zp->z_sa_hdl);
/*
* Don't allow a zfs_zget() while were trying to release this znode
@@ -1096,6 +1313,7 @@
zfs_rmnode(zp);
return;
}
+
mutex_exit(&zp->z_lock);
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
@@ -1127,59 +1345,40 @@
}
void
-zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+ uint64_t ctime[2], boolean_t have_tx)
{
timestruc_t now;
- ASSERT(MUTEX_HELD(&zp->z_lock));
-
gethrestime(&now);
- if (tx) {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
+ if (have_tx) { /* will sa_bulk_update happen really soon? */
zp->z_atime_dirty = 0;
zp->z_seq++;
} else {
zp->z_atime_dirty = 1;
}
- if (flag & AT_ATIME)
- ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
+ if (flag & AT_ATIME) {
+ ZFS_TIME_ENCODE(&now, zp->z_atime);
+ }
if (flag & AT_MTIME) {
- ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
- if (zp->z_zfsvfs->z_use_fuids)
- zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
+ ZFS_TIME_ENCODE(&now, mtime);
+ if (zp->z_zfsvfs->z_use_fuids) {
+ zp->z_pflags |= (ZFS_ARCHIVE |
+ ZFS_AV_MODIFIED);
+ }
}
if (flag & AT_CTIME) {
- ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
+ ZFS_TIME_ENCODE(&now, ctime);
if (zp->z_zfsvfs->z_use_fuids)
- zp->z_phys->zp_flags |= ZFS_ARCHIVE;
+ zp->z_pflags |= ZFS_ARCHIVE;
}
}
/*
- * Update the requested znode timestamps with the current time.
- * If we are in a transaction, then go ahead and mark the znode
- * dirty in the transaction so the timestamps will go to disk.
- * Otherwise, we will get pushed next time the znode is updated
- * in a transaction, or when this znode eventually goes inactive.
- *
- * Why is this OK?
- * 1 - Only the ACCESS time is ever updated outside of a transaction.
- * 2 - Multiple consecutive updates will be collapsed into a single
- * znode update by the transaction grouping semantics of the DMU.
- */
-void
-zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
-{
- mutex_enter(&zp->z_lock);
- zfs_time_stamper_locked(zp, flag, tx);
- mutex_exit(&zp->z_lock);
-}
-
-/*
* Grow the block size for a file.
*
* IN: zp - znode of file to free data in.
@@ -1201,17 +1400,18 @@
* we will not grow. If there is more than one block in a file,
* the blocksize cannot change.
*/
- if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
+ if (zp->z_blksz && zp->z_size > zp->z_blksz)
return;
error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
size, 0, tx);
+
if (error == ENOTSUP)
return;
ASSERT3U(error, ==, 0);
/* What blocksize did we actually get? */
- dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
+ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
}
/*
@@ -1254,13 +1454,14 @@
/*
* Nothing to do if file already at desired length.
*/
- if (end <= zp->z_phys->zp_size) {
+ if (end <= zp->z_size) {
zfs_range_unlock(rl);
return (0);
}
top:
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
if (end > zp->z_blksz &&
(!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
/*
@@ -1288,12 +1489,14 @@
zfs_range_unlock(rl);
return (error);
}
- dmu_buf_will_dirty(zp->z_dbuf, tx);
if (newblksz)
zfs_grow_blocksize(zp, newblksz, tx);
- zp->z_phys->zp_size = end;
+ zp->z_size = end;
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx));
zfs_range_unlock(rl);
@@ -1327,13 +1530,13 @@
/*
* Nothing to do if file already at desired length.
*/
- if (off >= zp->z_phys->zp_size) {
+ if (off >= zp->z_size) {
zfs_range_unlock(rl);
return (0);
}
- if (off + len > zp->z_phys->zp_size)
- len = zp->z_phys->zp_size - off;
+ if (off + len > zp->z_size)
+ len = zp->z_size - off;
error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
@@ -1368,7 +1571,7 @@
/*
* Nothing to do if file already at desired length.
*/
- if (end >= zp->z_phys->zp_size) {
+ if (end >= zp->z_size) {
zfs_range_unlock(rl);
return (0);
}
@@ -1380,7 +1583,8 @@
}
top:
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
if (error == ERESTART) {
@@ -1392,9 +1596,11 @@
zfs_range_unlock(rl);
return (error);
}
- dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ zp->z_size = end;
- zp->z_phys->zp_size = end;
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx));
dmu_tx_commit(tx);
@@ -1446,9 +1652,17 @@
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
zilog_t *zilog = zfsvfs->z_log;
+ uint64_t mode;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
int error;
- if (off > zp->z_phys->zp_size) {
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+ sizeof (mode))) != 0)
+ return (error);
+
+ if (off > zp->z_size) {
error = zfs_extend(zp, off+len);
if (error == 0 && log)
goto log;
@@ -1459,8 +1673,9 @@
/*
* Check for any locks in the region to be freed.
*/
- if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
- uint64_t length = (len ? len : zp->z_phys->zp_size - off);
+
+ if (MANDLOCK(vp, (mode_t)mode)) {
+ uint64_t length = (len ? len : zp->z_size - off);
if (error = chklock(vp, FWRITE, off, length, flag, NULL))
return (error);
}
@@ -1469,14 +1684,15 @@
error = zfs_trunc(zp, off);
} else {
if ((error = zfs_free_range(zp, off, len)) == 0 &&
- off + len > zp->z_phys->zp_size)
+ off + len > zp->z_size)
error = zfs_extend(zp, off+len);
}
if (error || !log)
return (error);
log:
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
if (error == ERESTART) {
@@ -1488,7 +1704,12 @@
return (error);
}
- zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
dmu_tx_commit(tx);
@@ -1499,7 +1720,7 @@
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
zfsvfs_t zfsvfs;
- uint64_t moid, obj, version;
+ uint64_t moid, obj, sa_obj, version;
uint64_t sense = ZFS_CASE_SENSITIVE;
uint64_t norm = 0;
nvpair_t *elem;
@@ -1526,12 +1747,7 @@
/*
* Set starting attributes.
*/
- if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
- version = ZPL_VERSION;
- else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
- version = ZPL_VERSION_USERSPACE - 1;
- else
- version = ZPL_VERSION_FUID - 1;
+ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
elem = NULL;
while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
/* For the moment we expect all zpl props to be uint64_ts */
@@ -1557,6 +1773,18 @@
error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
/*
+ * Create zap object used for SA attribute registration
+ */
+
+ if (version >= ZPL_VERSION_SA) {
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT(error == 0);
+ } else {
+ sa_obj = 0;
+ }
+ /*
* Create a delete queue.
*/
obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
@@ -1577,6 +1805,7 @@
rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
rootzp->z_unlinked = 0;
rootzp->z_atime_dirty = 0;
+ rootzp->z_is_sa = USE_SA(version, os);
vp = ZTOV(rootzp);
vn_reinit(vp);
@@ -1588,7 +1817,11 @@
zfsvfs.z_parent = &zfsvfs;
zfsvfs.z_version = version;
zfsvfs.z_use_fuids = USE_FUIDS(version, os);
+ zfsvfs.z_use_sa = USE_SA(version, os);
zfsvfs.z_norm = norm;
+
+ zfsvfs.z_attr_table = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END);
+
/*
* Fold case on file systems that are always or sometimes case
* insensitive.
@@ -1607,7 +1840,7 @@
rootzp->z_zfsvfs = &zfsvfs;
VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
cr, NULL, &acl_ids));
- zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
+ zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
ASSERT3P(zp, ==, rootzp);
ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */
error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
@@ -1616,8 +1849,7 @@
POINTER_INVALIDATE(&rootzp->z_zfsvfs);
ZTOV(rootzp)->v_count = 0;
- dmu_buf_rele(rootzp->z_dbuf, NULL);
- rootzp->z_dbuf = NULL;
+ sa_handle_destroy(rootzp->z_sa_hdl);
kmem_cache_free(znode_cache, rootzp);
/*
@@ -1633,33 +1865,59 @@
}
#endif /* _KERNEL */
+
/*
* Given an object number, return its parent object number and whether
* or not the object is an extended attribute directory.
*/
static int
-zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
+zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir,
+ sa_attr_type_t *sa_table)
{
dmu_buf_t *db;
dmu_object_info_t doi;
- znode_phys_t *zp;
int error;
+ uint64_t parent;
+ uint64_t pflags;
+ uint64_t mode;
+ sa_bulk_attr_t bulk[3];
+ sa_handle_t *hdl;
+ int count = 0;
- if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
+ if ((error = sa_buf_hold(osp, obj, FTAG, &db)) != 0)
return (error);
dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ if ((doi.doi_bonus_type != DMU_OT_SA &&
+ doi.doi_bonus_type != DMU_OT_ZNODE) ||
+ doi.doi_bonus_type == DMU_OT_ZNODE &&
doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db, FTAG);
+ sa_buf_rele(db, FTAG);
return (EINVAL);
}
- zp = db->db_data;
- *pobjp = zp->zp_parent;
- *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
- S_ISDIR(zp->zp_mode);
- dmu_buf_rele(db, FTAG);
+ if ((error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE,
+ &hdl)) != 0) {
+ sa_buf_rele(db, FTAG);
+ return (error);
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT],
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+ &pflags, 8);
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &mode, 8);
+
+ if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) {
+ sa_buf_rele(db, FTAG);
+ sa_handle_destroy(hdl);
+ return (error);
+ }
+ *pobjp = parent;
+ *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+ sa_handle_destroy(hdl);
+ sa_buf_rele(db, FTAG);
return (0);
}
@@ -1668,10 +1926,19 @@
zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
{
char *path = buf + len - 1;
+ sa_attr_type_t *sa_table;
int error;
+ uint64_t sa_obj = 0;
*path = '\0';
+ error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+
+ if (error != 0 && error != ENOENT)
+ return (error);
+
+ sa_table = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END);
+
for (;;) {
uint64_t pobj;
char component[MAXNAMELEN + 2];
@@ -1679,7 +1946,7 @@
int is_xattrdir;
if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
- &is_xattrdir)) != 0)
+ &is_xattrdir, sa_table)) != 0)
break;
if (pobj == obj) {
@@ -1707,5 +1974,6 @@
if (error == 0)
(void) memmove(buf, path, buf + len - path);
+
return (error);
}
--- a/usr/src/uts/common/sys/fs/zfs.h Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/uts/common/sys/fs/zfs.h Tue Mar 16 09:43:38 2010 -0600
@@ -324,14 +324,15 @@
#define SPA_VERSION_21 21ULL
#define SPA_VERSION_22 22ULL
#define SPA_VERSION_23 23ULL
+#define SPA_VERSION_24 24ULL
/*
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
* format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes. Also bump the version number in
* usr/src/grub/capability.
*/
-#define SPA_VERSION SPA_VERSION_23
-#define SPA_VERSION_STRING "23"
+#define SPA_VERSION SPA_VERSION_24
+#define SPA_VERSION_STRING "24"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -376,6 +377,7 @@
#define SPA_VERSION_DEDUP SPA_VERSION_21
#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22
#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
+#define SPA_VERSION_SA SPA_VERSION_24
/*
* ZPL version - rev'd whenever an incompatible on-disk format change
@@ -389,8 +391,9 @@
#define ZPL_VERSION_2 2ULL
#define ZPL_VERSION_3 3ULL
#define ZPL_VERSION_4 4ULL
-#define ZPL_VERSION ZPL_VERSION_4
-#define ZPL_VERSION_STRING "4"
+#define ZPL_VERSION_5 5ULL
+#define ZPL_VERSION ZPL_VERSION_5
+#define ZPL_VERSION_STRING "5"
#define ZPL_VERSION_INITIAL ZPL_VERSION_1
#define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2
@@ -398,6 +401,7 @@
#define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3
#define ZPL_VERSION_SYSATTR ZPL_VERSION_3
#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
+#define ZPL_VERSION_SA ZPL_VERSION_5
/* Rewind request information */
#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */