# HG changeset patch
# User Mark Shellenbaum
# Date 1268754218 21600
# Node ID 538c866aaac6d99cba53561d5b3ba29ec7fe7cae
# Parent fdae577692c4cbaf4b680504f52c3f7a8e737ce8
6716117 ZFS needs native system attribute infrastructure
6516171 zpl symlinks should have their own object type

diff -r fdae577692c4 -r 538c866aaac6 usr/src/cmd/mdb/common/modules/zfs/zfs.c
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c  Tue Mar 16 06:44:44 2010 -0700
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c  Tue Mar 16 09:43:38 2010 -0600
@@ -38,6 +38,8 @@
 #include
 #include
 #include
+#include <sys/zfs_acl.h>
+#include <sys/sa_impl.h>
 
 #ifndef _KERNEL
 #include "../genunix/list.h"
@@ -217,7 +219,7 @@
 objset_name(uintptr_t addr, char *buf)
 {
     static int gotid;
-    static mdb_ctf_id_t os_id, ds_id;
+    static mdb_ctf_id_t osi_id, ds_id;
     uintptr_t os_dsl_dataset;
     char ds_snapname[MAXNAMELEN];
     uintptr_t ds_dir;
@@ -225,9 +227,9 @@
     buf[0] = '\0';
 
     if (!gotid) {
-        if (mdb_ctf_lookup_by_name("struct objset",
-            &os_id) == -1) {
-            mdb_warn("couldn't find struct objset");
+        if (mdb_ctf_lookup_by_name("struct objset_impl",
+            &osi_id) == -1) {
+            mdb_warn("couldn't find struct objset_impl");
             return (DCMD_ERR);
         }
         if (mdb_ctf_lookup_by_name("struct dsl_dataset",
@@ -239,7 +241,7 @@
         gotid = TRUE;
     }
 
-    if (GETMEMBID(addr, &os_id, os_dsl_dataset, os_dsl_dataset))
+    if (GETMEMBID(addr, &osi_id, os_dsl_dataset, os_dsl_dataset))
         return (DCMD_ERR);
 
     if (os_dsl_dataset == 0) {
@@ -429,7 +431,7 @@
     (void) mdb_snprintf(objectname, sizeof (objectname), "%llx",
         (u_longlong_t)db.db_object);
 
-    if (blkid == DB_BONUS_BLKID)
+    if (blkid == DMU_BONUS_BLKID)
         (void) strcpy(blkidname, "bonus");
     else
         (void) mdb_snprintf(blkidname, sizeof (blkidname), "%llx",
@@ -716,7 +718,7 @@
 
     if (blkid) {
         if (strcmp(blkid, "bonus") == 0) {
-            data.blkid = DB_BONUS_BLKID;
+            data.blkid = DMU_BONUS_BLKID;
         } else {
             data.blkid = mdb_strtoull(blkid);
         }
@@ -2291,6 +2293,602 @@
     return (DCMD_OK);
 }
 
+/* ARGSUSED */
+static int
+sa_attr_table(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+    sa_attr_table_t *table;
+    sa_os_t sa_os;
+    char *name;
+    int i;
+
+    if (mdb_vread(&sa_os, sizeof (sa_os_t), addr) == -1) {
+        mdb_warn("failed to read sa_os at %p", addr);
+        return (DCMD_ERR);
+    }
+
+    table = mdb_alloc(sizeof (sa_attr_table_t) * sa_os.sa_num_attrs,
+        UM_SLEEP | UM_GC);
+    name = mdb_alloc(MAXPATHLEN, UM_SLEEP | UM_GC);
+
+    if (mdb_vread(table, sizeof (sa_attr_table_t) * sa_os.sa_num_attrs,
+        (uintptr_t)sa_os.sa_attr_table) == -1) {
+        mdb_warn("failed to read sa_os at %p", addr);
+        return (DCMD_ERR);
+    }
+
+    mdb_printf("%<u>%-10s %-10s %-10s %-10s %s%</u>\n",
+        "ATTR ID", "REGISTERED", "LENGTH", "BSWAP", "NAME");
+    for (i = 0; i != sa_os.sa_num_attrs; i++) {
+        mdb_readstr(name, MAXPATHLEN, (uintptr_t)table[i].sa_name);
+        mdb_printf("%5x %8x %8x %8x %-s\n",
+            (int)table[i].sa_attr, (int)table[i].sa_registered,
+            (int)table[i].sa_length, table[i].sa_byteswap, name);
+    }
+
+    return (DCMD_OK);
+}
+
+static int
+sa_get_off_table(uintptr_t addr, uint32_t **off_tab, int attr_count)
+{
+    uintptr_t idx_table;
+
+    if (GETMEMB(addr, struct sa_idx_tab, sa_idx_tab, idx_table)) {
+        mdb_printf("can't find offset table in sa_idx_tab\n");
+        return (-1);
+    }
+
+    *off_tab = mdb_alloc(attr_count * sizeof (uint32_t),
+        UM_SLEEP | UM_GC);
+
+    if (mdb_vread(*off_tab,
+        attr_count * sizeof (uint32_t), idx_table) == -1) {
+        mdb_warn("failed to read attribute offset table %p", idx_table);
+        return (-1);
+    }
+
+    return (DCMD_OK);
+}
+
+/*ARGSUSED*/
+static int
+sa_attr_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
uint32_t *offset_tab; + int attr_count; + uint64_t attr_id; + uintptr_t attr_addr; + uintptr_t bonus_tab, spill_tab; + uintptr_t db_bonus, db_spill; + uintptr_t os, os_sa; + uintptr_t db_data; + + if (argc != 1) + return (DCMD_USAGE); + + if (argv[0].a_type == MDB_TYPE_STRING) + attr_id = mdb_strtoull(argv[0].a_un.a_str); + else + return (DCMD_USAGE); + + if (GETMEMB(addr, struct sa_handle, sa_bonus_tab, bonus_tab) || + GETMEMB(addr, struct sa_handle, sa_spill_tab, spill_tab) || + GETMEMB(addr, struct sa_handle, sa_os, os) || + GETMEMB(addr, struct sa_handle, sa_bonus, db_bonus) || + GETMEMB(addr, struct sa_handle, sa_spill, db_spill)) { + mdb_printf("Can't find necessary information in sa_handle " + "in sa_handle\n"); + return (DCMD_ERR); + } + + if (GETMEMB(os, struct objset, os_sa, os_sa)) { + mdb_printf("Can't find os_sa in objset\n"); + return (DCMD_ERR); + } + + if (GETMEMB(os_sa, struct sa_os, sa_num_attrs, attr_count)) { + mdb_printf("Can't find sa_num_attrs\n"); + return (DCMD_ERR); + } + + if (attr_id > attr_count) { + mdb_printf("attribute id number is out of range\n"); + return (DCMD_ERR); + } + + if (bonus_tab) { + if (sa_get_off_table(bonus_tab, &offset_tab, + attr_count) == -1) { + return (DCMD_ERR); + } + + if (GETMEMB(db_bonus, struct dmu_buf, db_data, db_data)) { + mdb_printf("can't find db_data in bonus dbuf\n"); + return (DCMD_ERR); + } + } + + if (bonus_tab && !TOC_ATTR_PRESENT(offset_tab[attr_id]) && + spill_tab == NULL) { + mdb_printf("Attribute does not exist\n"); + return (DCMD_ERR); + } else if (!TOC_ATTR_PRESENT(offset_tab[attr_id]) && spill_tab) { + if (sa_get_off_table(spill_tab, &offset_tab, + attr_count) == -1) { + return (DCMD_ERR); + } + if (GETMEMB(db_spill, struct dmu_buf, db_data, db_data)) { + mdb_printf("can't find db_data in spill dbuf\n"); + return (DCMD_ERR); + } + if (!TOC_ATTR_PRESENT(offset_tab[attr_id])) { + mdb_printf("Attribute does not exist\n"); + return (DCMD_ERR); + } + } + attr_addr = db_data + TOC_OFF(offset_tab[attr_id]); + mdb_printf("%p\n", attr_addr); + return (DCMD_OK); +} + +/* ARGSUSED */ +static int +zfs_ace_print_common(uintptr_t addr, uint_t flags, + uint64_t id, uint32_t access_mask, uint16_t ace_flags, + uint16_t ace_type, int verbose) +{ + if (DCMD_HDRSPEC(flags) && !verbose) + mdb_printf("%%-?s %-8s %-8s %-8s %s%\n", + "ADDR", "FLAGS", "MASK", "TYPE", "ID"); + + if (!verbose) { + mdb_printf("%0?p %-8x %-8x %-8x %-llx\n", addr, + ace_flags, access_mask, ace_type, id); + return (DCMD_OK); + } + + switch (ace_flags & ACE_TYPE_FLAGS) { + case ACE_OWNER: + mdb_printf("owner@:"); + break; + case (ACE_IDENTIFIER_GROUP | ACE_GROUP): + mdb_printf("group@:"); + break; + case ACE_EVERYONE: + mdb_printf("everyone@:"); + break; + case ACE_IDENTIFIER_GROUP: + mdb_printf("group:%llx:", (u_longlong_t)id); + break; + case 0: /* User entry */ + mdb_printf("user:%llx:", (u_longlong_t)id); + break; + } + + /* print out permission mask */ + if (access_mask & ACE_READ_DATA) + mdb_printf("r"); + else + mdb_printf("-"); + if (access_mask & ACE_WRITE_DATA) + mdb_printf("w"); + else + mdb_printf("-"); + if (access_mask & ACE_EXECUTE) + mdb_printf("x"); + else + mdb_printf("-"); + if (access_mask & ACE_APPEND_DATA) + mdb_printf("p"); + else + mdb_printf("-"); + if (access_mask & ACE_DELETE) + mdb_printf("d"); + else + mdb_printf("-"); + if (access_mask & ACE_DELETE_CHILD) + mdb_printf("D"); + else + mdb_printf("-"); + if (access_mask & ACE_READ_ATTRIBUTES) + mdb_printf("a"); + else + mdb_printf("-"); + if (access_mask & ACE_WRITE_ATTRIBUTES) + 
mdb_printf("A"); + else + mdb_printf("-"); + if (access_mask & ACE_READ_NAMED_ATTRS) + mdb_printf("R"); + else + mdb_printf("-"); + if (access_mask & ACE_WRITE_NAMED_ATTRS) + mdb_printf("W"); + else + mdb_printf("-"); + if (access_mask & ACE_READ_ACL) + mdb_printf("c"); + else + mdb_printf("-"); + if (access_mask & ACE_WRITE_ACL) + mdb_printf("C"); + else + mdb_printf("-"); + if (access_mask & ACE_WRITE_OWNER) + mdb_printf("o"); + else + mdb_printf("-"); + if (access_mask & ACE_SYNCHRONIZE) + mdb_printf("s"); + else + mdb_printf("-"); + + mdb_printf(":"); + + /* Print out inheritance flags */ + if (ace_flags & ACE_FILE_INHERIT_ACE) + mdb_printf("f"); + else + mdb_printf("-"); + if (ace_flags & ACE_DIRECTORY_INHERIT_ACE) + mdb_printf("d"); + else + mdb_printf("-"); + if (ace_flags & ACE_INHERIT_ONLY_ACE) + mdb_printf("i"); + else + mdb_printf("-"); + if (ace_flags & ACE_NO_PROPAGATE_INHERIT_ACE) + mdb_printf("n"); + else + mdb_printf("-"); + if (ace_flags & ACE_SUCCESSFUL_ACCESS_ACE_FLAG) + mdb_printf("S"); + else + mdb_printf("-"); + if (ace_flags & ACE_FAILED_ACCESS_ACE_FLAG) + mdb_printf("F"); + else + mdb_printf("-"); + if (ace_flags & ACE_INHERITED_ACE) + mdb_printf("I"); + else + mdb_printf("-"); + + switch (ace_type) { + case ACE_ACCESS_ALLOWED_ACE_TYPE: + mdb_printf(":allow\n"); + break; + case ACE_ACCESS_DENIED_ACE_TYPE: + mdb_printf(":deny\n"); + break; + case ACE_SYSTEM_AUDIT_ACE_TYPE: + mdb_printf(":audit\n"); + break; + case ACE_SYSTEM_ALARM_ACE_TYPE: + mdb_printf(":alarm\n"); + break; + default: + mdb_printf(":?\n"); + } + return (DCMD_OK); +} + +/* ARGSUSED */ +static int +zfs_ace_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + zfs_ace_t zace; + int verbose = FALSE; + uint64_t id; + + if (!(flags & DCMD_ADDRSPEC)) + return (DCMD_USAGE); + + if (mdb_getopts(argc, argv, + 'v', MDB_OPT_SETBITS, TRUE, &verbose, TRUE, NULL) != argc) + return (DCMD_USAGE); + + if (mdb_vread(&zace, sizeof (zfs_ace_t), addr) == -1) { + mdb_warn("failed to read zfs_ace_t"); + return (DCMD_ERR); + } + + if ((zace.z_hdr.z_flags & ACE_TYPE_FLAGS) == 0 || + (zace.z_hdr.z_flags & ACE_TYPE_FLAGS) == ACE_IDENTIFIER_GROUP) + id = zace.z_fuid; + else + id = -1; + + return (zfs_ace_print_common(addr, flags, id, zace.z_hdr.z_access_mask, + zace.z_hdr.z_flags, zace.z_hdr.z_type, verbose)); +} + +/* ARGSUSED */ +static int +zfs_ace0_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + ace_t ace; + uint64_t id; + int verbose = FALSE; + + if (!(flags & DCMD_ADDRSPEC)) + return (DCMD_USAGE); + + if (mdb_getopts(argc, argv, + 'v', MDB_OPT_SETBITS, TRUE, &verbose, TRUE, NULL) != argc) + return (DCMD_USAGE); + + if (mdb_vread(&ace, sizeof (ace_t), addr) == -1) { + mdb_warn("failed to read ace_t"); + return (DCMD_ERR); + } + + if ((ace.a_flags & ACE_TYPE_FLAGS) == 0 || + (ace.a_flags & ACE_TYPE_FLAGS) == ACE_IDENTIFIER_GROUP) + id = ace.a_who; + else + id = -1; + + return (zfs_ace_print_common(addr, flags, id, ace.a_access_mask, + ace.a_flags, ace.a_type, verbose)); +} + +typedef struct acl_dump_args { + int a_argc; + const mdb_arg_t *a_argv; + uint16_t a_version; + int a_flags; +} acl_dump_args_t; + +/* ARGSUSED */ +static int +acl_aces_cb(uintptr_t addr, const void *unknown, void *arg) +{ + acl_dump_args_t *acl_args = (acl_dump_args_t *)arg; + + if (acl_args->a_version == 1) { + if (mdb_call_dcmd("zfs_ace", addr, + DCMD_ADDRSPEC|acl_args->a_flags, acl_args->a_argc, + acl_args->a_argv) != DCMD_OK) { + return (WALK_ERR); + } + } else { + if (mdb_call_dcmd("zfs_ace0", addr, + 
DCMD_ADDRSPEC|acl_args->a_flags, acl_args->a_argc, + acl_args->a_argv) != DCMD_OK) { + return (WALK_ERR); + } + } + acl_args->a_flags = DCMD_LOOP; + return (WALK_NEXT); +} + +/* ARGSUSED */ +static int +acl_cb(uintptr_t addr, const void *unknown, void *arg) +{ + acl_dump_args_t *acl_args = (acl_dump_args_t *)arg; + + if (acl_args->a_version == 1) { + if (mdb_pwalk("zfs_acl_node_aces", acl_aces_cb, + arg, addr) != 0) { + mdb_warn("can't walk ACEs"); + return (DCMD_ERR); + } + } else { + if (mdb_pwalk("zfs_acl_node_aces0", acl_aces_cb, + arg, addr) != 0) { + mdb_warn("can't walk ACEs"); + return (DCMD_ERR); + } + } + return (WALK_NEXT); +} + +/* ARGSUSED */ +static int +zfs_acl_dump(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + zfs_acl_t zacl; + int verbose = FALSE; + acl_dump_args_t acl_args; + + if (!(flags & DCMD_ADDRSPEC)) + return (DCMD_USAGE); + + if (mdb_getopts(argc, argv, + 'v', MDB_OPT_SETBITS, TRUE, &verbose, TRUE, NULL) != argc) + return (DCMD_USAGE); + + if (mdb_vread(&zacl, sizeof (zfs_acl_t), addr) == -1) { + mdb_warn("failed to read zfs_acl_t"); + return (DCMD_ERR); + } + + acl_args.a_argc = argc; + acl_args.a_argv = argv; + acl_args.a_version = zacl.z_version; + acl_args.a_flags = DCMD_LOOPFIRST; + + if (mdb_pwalk("zfs_acl_node", acl_cb, &acl_args, addr) != 0) { + mdb_warn("can't walk ACL"); + return (DCMD_ERR); + } + + return (DCMD_OK); +} + +/* ARGSUSED */ +static int +zfs_acl_node_walk_init(mdb_walk_state_t *wsp) +{ + if (wsp->walk_addr == NULL) { + mdb_warn("must supply address of zfs_acl_node_t\n"); + return (WALK_ERR); + } + + wsp->walk_addr += OFFSETOF(zfs_acl_t, z_acl); + + if (mdb_layered_walk("list", wsp) == -1) { + mdb_warn("failed to walk 'list'\n"); + return (WALK_ERR); + } + + return (WALK_NEXT); +} + +static int +zfs_acl_node_walk_step(mdb_walk_state_t *wsp) +{ + zfs_acl_node_t aclnode; + + if (mdb_vread(&aclnode, sizeof (zfs_acl_node_t), + wsp->walk_addr) == -1) { + mdb_warn("failed to read zfs_acl_node at %p", wsp->walk_addr); + return (WALK_ERR); + } + + return (wsp->walk_callback(wsp->walk_addr, &aclnode, wsp->walk_cbdata)); +} + +typedef struct ace_walk_data { + int ace_count; + int ace_version; +} ace_walk_data_t; + +static int +zfs_aces_walk_init_common(mdb_walk_state_t *wsp, int version, + int ace_count, uintptr_t ace_data) +{ + ace_walk_data_t *ace_walk_data; + + if (wsp->walk_addr == NULL) { + mdb_warn("must supply address of zfs_acl_node_t\n"); + return (WALK_ERR); + } + + ace_walk_data = mdb_alloc(sizeof (ace_walk_data_t), UM_SLEEP | UM_GC); + + ace_walk_data->ace_count = ace_count; + ace_walk_data->ace_version = version; + + wsp->walk_addr = ace_data; + wsp->walk_data = ace_walk_data; + + return (WALK_NEXT); +} + +static int +zfs_acl_node_aces_walk_init_common(mdb_walk_state_t *wsp, int version) +{ + static int gotid; + static mdb_ctf_id_t acl_id; + int z_ace_count; + uintptr_t z_acldata; + + if (!gotid) { + if (mdb_ctf_lookup_by_name("struct zfs_acl_node", + &acl_id) == -1) { + mdb_warn("couldn't find struct zfs_acl_node"); + return (DCMD_ERR); + } + gotid = TRUE; + } + + if (GETMEMBID(wsp->walk_addr, &acl_id, z_ace_count, z_ace_count)) { + return (DCMD_ERR); + } + if (GETMEMBID(wsp->walk_addr, &acl_id, z_acldata, z_acldata)) { + return (DCMD_ERR); + } + + return (zfs_aces_walk_init_common(wsp, version, + z_ace_count, z_acldata)); +} + +/* ARGSUSED */ +static int +zfs_acl_node_aces_walk_init(mdb_walk_state_t *wsp) +{ + return (zfs_acl_node_aces_walk_init_common(wsp, 1)); +} + +/* ARGSUSED */ +static int 
+zfs_acl_node_aces0_walk_init(mdb_walk_state_t *wsp) +{ + return (zfs_acl_node_aces_walk_init_common(wsp, 0)); +} + +static int +zfs_aces_walk_step(mdb_walk_state_t *wsp) +{ + ace_walk_data_t *ace_data = wsp->walk_data; + zfs_ace_t zace; + ace_t *acep; + int status; + int entry_type; + int allow_type; + uintptr_t ptr; + + if (ace_data->ace_count == 0) + return (WALK_DONE); + + if (mdb_vread(&zace, sizeof (zfs_ace_t), wsp->walk_addr) == -1) { + mdb_warn("failed to read zfs_ace_t at %#lx", + wsp->walk_addr); + return (WALK_ERR); + } + + switch (ace_data->ace_version) { + case 0: + acep = (ace_t *)&zace; + entry_type = acep->a_flags & ACE_TYPE_FLAGS; + allow_type = acep->a_type; + break; + case 1: + entry_type = zace.z_hdr.z_flags & ACE_TYPE_FLAGS; + allow_type = zace.z_hdr.z_type; + break; + default: + return (WALK_ERR); + } + + ptr = (uintptr_t)wsp->walk_addr; + switch (entry_type) { + case ACE_OWNER: + case ACE_EVERYONE: + case (ACE_IDENTIFIER_GROUP | ACE_GROUP): + ptr += ace_data->ace_version == 0 ? + sizeof (ace_t) : sizeof (zfs_ace_hdr_t); + break; + case ACE_IDENTIFIER_GROUP: + default: + switch (allow_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + ptr += ace_data->ace_version == 0 ? + sizeof (ace_t) : sizeof (zfs_object_ace_t); + break; + default: + ptr += ace_data->ace_version == 0 ? + sizeof (ace_t) : sizeof (zfs_ace_t); + break; + } + } + + ace_data->ace_count--; + status = wsp->walk_callback(wsp->walk_addr, + (void *)(uintptr_t)&zace, wsp->walk_cbdata); + + wsp->walk_addr = ptr; + return (status); +} + /* * MDB module linkage information: * @@ -2304,7 +2902,7 @@ { "dbuf", ":", "print dmu_buf_impl_t", dbuf }, { "dbuf_stats", ":", "dbuf stats", dbuf_stats }, { "dbufs", - "\t[-O objset_t*] [-n objset_name | \"mos\"] " + "\t[-O objset_impl_t*] [-n objset_name | \"mos\"] " "[-o object | \"mdn\"] \n" "\t[-l level] [-b blkid | \"bonus\"]", "find dmu_buf_impl_t's that match specified criteria", dbufs }, @@ -2333,6 +2931,14 @@ { "zfs_params", "", "print zfs tunable parameters", zfs_params }, { "refcount", "", "print refcount_t holders", refcount }, { "zap_leaf", "", "print zap_leaf_phys_t", zap_leaf }, + { "zfs_aces", ":[-v]", "print all ACEs from a zfs_acl_t", + zfs_acl_dump }, + { "zfs_ace", ":[-v]", "print zfs_ace", zfs_ace_print }, + { "zfs_ace0", ":[-v]", "print zfs_ace0", zfs_ace0_print }, + { "sa_attr_table", ":", "print SA attribute table from sa_os_t", + sa_attr_table}, + { "sa_attr", ": attr_id", + "print SA attribute address when given sa_handle_t", sa_attr_print}, { NULL } }; @@ -2366,6 +2972,13 @@ spa_walk_init, spa_walk_step, NULL }, { "metaslab", "given a spa_t *, walk all metaslab_t structures", metaslab_walk_init, metaslab_walk_step, NULL }, + { "zfs_acl_node", "given a zfs_acl_t, walk all zfs_acl_nodes", + zfs_acl_node_walk_init, zfs_acl_node_walk_step, NULL }, + { "zfs_acl_node_aces", "given a zfs_acl_node_t, walk all ACEs", + zfs_acl_node_aces_walk_init, zfs_aces_walk_step, NULL }, + { "zfs_acl_node_aces0", + "given a zfs_acl_node_t, walk all ACEs as ace_t", + zfs_acl_node_aces0_walk_init, zfs_aces_walk_step, NULL }, { NULL } }; diff -r fdae577692c4 -r 538c866aaac6 usr/src/cmd/zdb/zdb.c --- a/usr/src/cmd/zdb/zdb.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/cmd/zdb/zdb.c Tue Mar 16 09:43:38 2010 -0600 @@ -34,6 +34,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -370,6 +373,71 @@ /*ARGSUSED*/ static 
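
The dcmds and walkers registered above are driven from an mdb session. A usage
sketch follows; the target addresses and the attribute id are placeholders for
illustration, not values taken from this changeset:

    > <sa_os_t addr>::sa_attr_table
    > <sa_handle_t addr>::sa_attr 3
    > <zfs_acl_t addr>::zfs_aces -v
    > <zfs_acl_node_t addr>::walk zfs_acl_node_aces | ::zfs_ace
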
void +dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = ", attr.za_name); + if (attr.za_num_integers == 0) { + (void) printf("\n"); + continue; + } + (void) printf(" %llx : [%d:%d:%d]\n", + (u_longlong_t)attr.za_first_integer, + (int)ATTR_LENGTH(attr.za_first_integer), + (int)ATTR_BSWAP(attr.za_first_integer), + (int)ATTR_NUM(attr.za_first_integer)); + } + zap_cursor_fini(&zc); +} + +/*ARGSUSED*/ +static void +dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + uint16_t *layout_attrs; + int i; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = [", attr.za_name); + if (attr.za_num_integers == 0) { + (void) printf("\n"); + continue; + } + + VERIFY(attr.za_integer_length == 2); + layout_attrs = umem_zalloc(attr.za_num_integers * + attr.za_integer_length, UMEM_NOFAIL); + + VERIFY(zap_lookup(os, object, attr.za_name, + attr.za_integer_length, + attr.za_num_integers, layout_attrs) == 0); + + for (i = 0; i != attr.za_num_integers; i++) + (void) printf(" %d ", (int)layout_attrs[i]); + (void) printf("]\n"); + umem_free(layout_attrs, + attr.za_num_integers * attr.za_integer_length); + } + zap_cursor_fini(&zc); +} + +/*ARGSUSED*/ +static void dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; @@ -1106,6 +1174,8 @@ static avl_tree_t idx_tree; static avl_tree_t domain_tree; static boolean_t fuid_table_loaded; +static boolean_t sa_loaded; +sa_attr_type_t *sa_attr_table; static void fuid_table_destroy() @@ -1138,12 +1208,12 @@ } static void -dump_uidgid(objset_t *os, znode_phys_t *zp) +dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) { uint32_t uid_idx, gid_idx; - uid_idx = FUID_INDEX(zp->zp_uid); - gid_idx = FUID_INDEX(zp->zp_gid); + uid_idx = FUID_INDEX(uid); + gid_idx = FUID_INDEX(gid); /* Load domain table, if not already loaded */ if (!fuid_table_loaded && (uid_idx || gid_idx)) { @@ -1158,50 +1228,103 @@ fuid_table_loaded = B_TRUE; } - print_idstr(zp->zp_uid, "uid"); - print_idstr(zp->zp_gid, "gid"); + print_idstr(uid, "uid"); + print_idstr(gid, "gid"); } /*ARGSUSED*/ static void dump_znode(objset_t *os, uint64_t object, void *data, size_t size) { - znode_phys_t *zp = data; + char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ + sa_handle_t *hdl; + uint64_t xattr, rdev, gen; + uint64_t uid, gid, mode, fsize, parent, links; + uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; time_t z_crtime, z_atime, z_mtime, z_ctime; - char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ + sa_bulk_attr_t bulk[11]; + int idx = 0; int error; - ASSERT(size >= sizeof (znode_phys_t)); + if (!sa_loaded) { + uint64_t sa_attrs = 0; + uint64_t version; + + VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &version) == 0); + if (version >= ZPL_VERSION_SA) { + VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, + 8, 1, &sa_attrs) == 0); + } + sa_attr_table = sa_setup(os, sa_attrs, + zfs_attr_table, ZPL_END); + sa_loaded = B_TRUE; + } + + if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { + (void) printf("Failed to get handle for SA znode\n"); + return; + } + + 
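
zdb's dump_znode() no longer casts the bonus buffer to znode_phys_t; every
field now comes back through the SA layer. A minimal sketch of that lookup
pattern, assuming the sa_handle_get()/SA_ADD_BULK_ATTR()/sa_bulk_lookup()
interfaces and the sa_attr_table mapping set up above (read_uid_gid is a
hypothetical helper, error handling trimmed):

    static int
    read_uid_gid(objset_t *os, uint64_t object, uint64_t *uid, uint64_t *gid)
    {
            sa_handle_t *hdl;
            sa_bulk_attr_t bulk[2];
            int idx = 0;

            if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl) != 0)
                    return (-1);

            /* queue the wanted attributes, then fetch them in one call */
            SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, uid, 8);
            SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, gid, 8);

            if (sa_bulk_lookup(hdl, bulk, idx) != 0) {
                    sa_handle_destroy(hdl);
                    return (-1);
            }
            sa_handle_destroy(hdl);
            return (0);
    }
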
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, + &links, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, + &mode, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], + NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, + &fsize, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, + acctm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, + modtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, + crtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, + chgtm, 16); + + if (sa_bulk_lookup(hdl, bulk, idx)) { + (void) sa_handle_destroy(hdl); + return; + } error = zfs_obj_to_path(os, object, path, sizeof (path)); if (error != 0) { (void) snprintf(path, sizeof (path), "\?\?\?", (u_longlong_t)object); } - if (dump_opt['d'] < 3) { (void) printf("\t%s\n", path); + (void) sa_handle_destroy(hdl); return; } - z_crtime = (time_t)zp->zp_crtime[0]; - z_atime = (time_t)zp->zp_atime[0]; - z_mtime = (time_t)zp->zp_mtime[0]; - z_ctime = (time_t)zp->zp_ctime[0]; + z_crtime = (time_t)crtm[0]; + z_atime = (time_t)acctm[0]; + z_mtime = (time_t)modtm[0]; + z_ctime = (time_t)chgtm[0]; (void) printf("\tpath %s\n", path); - dump_uidgid(os, zp); + dump_uidgid(os, uid, gid); (void) printf("\tatime %s", ctime(&z_atime)); (void) printf("\tmtime %s", ctime(&z_mtime)); (void) printf("\tctime %s", ctime(&z_ctime)); (void) printf("\tcrtime %s", ctime(&z_crtime)); - (void) printf("\tgen %llu\n", (u_longlong_t)zp->zp_gen); - (void) printf("\tmode %llo\n", (u_longlong_t)zp->zp_mode); - (void) printf("\tsize %llu\n", (u_longlong_t)zp->zp_size); - (void) printf("\tparent %llu\n", (u_longlong_t)zp->zp_parent); - (void) printf("\tlinks %llu\n", (u_longlong_t)zp->zp_links); - (void) printf("\txattr %llu\n", (u_longlong_t)zp->zp_xattr); - (void) printf("\trdev 0x%016llx\n", (u_longlong_t)zp->zp_rdev); + (void) printf("\tgen %llu\n", (u_longlong_t)gen); + (void) printf("\tmode %llo\n", (u_longlong_t)mode); + (void) printf("\tsize %llu\n", (u_longlong_t)fsize); + (void) printf("\tparent %llu\n", (u_longlong_t)parent); + (void) printf("\tlinks %llu\n", (u_longlong_t)links); + if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, + sizeof (uint64_t)) == 0) + (void) printf("\txattr %llu\n", (u_longlong_t)xattr); + if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, + sizeof (uint64_t)) == 0) + (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); + sa_handle_destroy(hdl); } /*ARGSUSED*/ @@ -1261,7 +1384,11 @@ dump_zap, /* snapshot refcount tags */ dump_ddt_zap, /* DDT ZAP object */ dump_zap, /* DDT statistics */ - dump_unknown /* Unknown type, must be last */ + dump_znode, /* SA object */ + dump_zap, /* SA Master Node */ + dump_sa_attrs, /* SA attribute registration */ + dump_sa_layouts, /* SA attribute layouts */ + dump_unknown, /* Unknown type, must be last */ }; static void @@ -1328,11 +1455,13 @@ } if (verbosity >= 4) { - (void) printf("\tdnode flags: %s%s\n", + (void) printf("\tdnode flags: %s%s%s\n", (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? "USED_BYTES " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? - "USERUSED_ACCOUNTED " : ""); + "USERUSED_ACCOUNTED " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? 
+ "SPILL_BLKPTR" : ""); (void) printf("\tdnode maxblkid: %llu\n", (longlong_t)dn->dn_phys->dn_maxblkid); @@ -1685,6 +1814,7 @@ dump_dir(os); dmu_objset_disown(os, FTAG); fuid_table_destroy(); + sa_loaded = B_FALSE; return (0); } @@ -2961,6 +3091,7 @@ (os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG); fuid_table_destroy(); + sa_loaded = B_FALSE; libzfs_fini(g_zfs); kernel_fini(); diff -r fdae577692c4 -r 538c866aaac6 usr/src/cmd/zfs/Makefile --- a/usr/src/cmd/zfs/Makefile Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/cmd/zfs/Makefile Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Copyright 2010 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -39,10 +39,12 @@ LDLIBS += -lzfs -luutil -lumem -lnvpair +INCS += -I../../common/zfs + C99MODE= -xc99=%all C99LMODE= -Xc99=%all -CPPFLAGS += -D_LARGEFILE64_SOURCE=1 -D_REENTRANT +CPPFLAGS += -D_LARGEFILE64_SOURCE=1 -D_REENTRANT $(INCS) $(NOT_RELEASE_BUILD)CPPFLAGS += -DDEBUG # lint complains about unused _umem_* functions diff -r fdae577692c4 -r 538c866aaac6 usr/src/cmd/zfs/zfs_main.c --- a/usr/src/cmd/zfs/zfs_main.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/cmd/zfs/zfs_main.c Tue Mar 16 09:43:38 2010 -0600 @@ -53,6 +53,7 @@ #include "zfs_iter.h" #include "zfs_util.h" +#include "zfs_comutil.h" libzfs_handle_t *g_zfs; @@ -1594,31 +1595,25 @@ { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); - int i; - static struct { int zplver; int spaver; } table[] = { - {ZPL_VERSION_FUID, SPA_VERSION_FUID}, - {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE}, - {0, 0} - }; - - - for (i = 0; table[i].zplver; i++) { - if (cb->cb_version >= table[i].zplver) { - int spa_version; - - if (zfs_spa_version(zhp, &spa_version) < 0) - return (-1); - - if (spa_version < table[i].spaver) { - /* can't upgrade */ - (void) printf(gettext("%s: can not be " - "upgraded; the pool version needs to first " - "be upgraded\nto version %d\n\n"), - zfs_get_name(zhp), table[i].spaver); - cb->cb_numfailed++; - return (0); - } - } + int needed_spa_version; + int spa_version; + + if (zfs_spa_version(zhp, &spa_version) < 0) + return (-1); + + needed_spa_version = zfs_spa_version_map(cb->cb_version); + + if (needed_spa_version < 0) + return (-1); + + if (spa_version < needed_spa_version) { + /* can't upgrade */ + (void) printf(gettext("%s: can not be " + "upgraded; the pool version needs to first " + "be upgraded\nto version %d\n\n"), + zfs_get_name(zhp), needed_spa_version); + cb->cb_numfailed++; + return (0); } /* upgrade */ @@ -1720,6 +1715,7 @@ "unique identifier (FUID)\n")); (void) printf(gettext(" 4 userquota, groupquota " "properties\n")); + (void) printf(gettext(" 5 System attributes\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" diff -r fdae577692c4 -r 538c866aaac6 usr/src/cmd/zpool/zpool_main.c --- a/usr/src/cmd/zpool/zpool_main.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/cmd/zpool/zpool_main.c Tue Mar 16 09:43:38 2010 -0600 @@ -3889,6 +3889,7 @@ (void) printf(gettext(" 21 Deduplication\n")); (void) printf(gettext(" 22 Received properties\n")); (void) printf(gettext(" 23 Slim ZIL\n")); + (void) printf(gettext(" 24 System attributes\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases, see:\n\n")); (void) 
printf("http://www.opensolaris.org/os/community/zfs/" diff -r fdae577692c4 -r 538c866aaac6 usr/src/cmd/zstreamdump/zstreamdump.c --- a/usr/src/cmd/zstreamdump/zstreamdump.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/cmd/zstreamdump/zstreamdump.c Tue Mar 16 09:43:38 2010 -0600 @@ -20,7 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -88,6 +88,7 @@ struct drr_write *drrw = &thedrr.drr_u.drr_write; struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref; struct drr_free *drrf = &thedrr.drr_u.drr_free; + struct drr_spill *drrs = &thedrr.drr_u.drr_spill; char c; boolean_t verbose = B_FALSE; boolean_t first = B_TRUE; @@ -378,6 +379,18 @@ (longlong_t)drrf->drr_length); } break; + case DRR_SPILL: + if (do_byteswap) { + drrs->drr_object = BSWAP_64(drrs->drr_object); + drrs->drr_length = BSWAP_64(drrs->drr_length); + } + if (verbose) { + (void) printf("SPILL block for object = %llu " + "length = %llu\n", drrs->drr_object, + drrs->drr_length); + } + (void) ssread(buf, drrs->drr_length, &zc); + break; } pcksum = zc; } @@ -398,12 +411,15 @@ (u_longlong_t)drr_record_count[DRR_WRITE]); (void) printf("\tTotal DRR_FREE records = %lld\n", (u_longlong_t)drr_record_count[DRR_FREE]); + (void) printf("\tTotal DRR_SPILL records = %lld\n", + (u_longlong_t)drr_record_count[DRR_SPILL]); (void) printf("\tTotal records = %lld\n", (u_longlong_t)(drr_record_count[DRR_BEGIN] + drr_record_count[DRR_OBJECT] + drr_record_count[DRR_FREEOBJECTS] + drr_record_count[DRR_WRITE] + drr_record_count[DRR_FREE] + + drr_record_count[DRR_SPILL] + drr_record_count[DRR_END])); (void) printf("\tTotal write size = %lld (0x%llx)\n", (u_longlong_t)total_write_size, (u_longlong_t)total_write_size); diff -r fdae577692c4 -r 538c866aaac6 usr/src/common/zfs/zfs_comutil.c --- a/usr/src/common/zfs/zfs_comutil.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/common/zfs/zfs_comutil.c Tue Mar 16 09:43:38 2010 -0600 @@ -39,6 +39,7 @@ #include #include #include +#include "zfs_comutil.h" /* * Are there allocatable vdevs? @@ -103,3 +104,56 @@ if (zrpp->zrp_request == 0) zrpp->zrp_request = ZPOOL_NO_REWIND; } + +typedef struct zfs_version_spa_map { + int version_zpl; + int version_spa; +} zfs_version_spa_map_t; + +/* + * Keep this table in monotonically increasing version number order. + */ +static zfs_version_spa_map_t zfs_version_table[] = { + {ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL}, + {ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL}, + {ZPL_VERSION_FUID, SPA_VERSION_FUID}, + {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE}, + {ZPL_VERSION_SA, SPA_VERSION_SA}, + {0, 0} +}; + +/* + * Return the max zpl version for a corresponding spa version + * -1 is returned if no mapping exists. + */ +int +zfs_zpl_version_map(int spa_version) +{ + int i; + int version = -1; + + for (i = 0; zfs_version_table[i].version_spa; i++) { + if (spa_version >= zfs_version_table[i].version_spa) + version = zfs_version_table[i].version_zpl; + } + + return (version); +} + +/* + * Return the min spa version for a corresponding spa version + * -1 is returned if no mapping exists. 
+ */ +int +zfs_spa_version_map(int zpl_version) +{ + int i; + int version = -1; + + for (i = 0; zfs_version_table[i].version_zpl; i++) { + if (zfs_version_table[i].version_zpl >= zpl_version) + return (zfs_version_table[i].version_spa); + } + + return (version); +} diff -r fdae577692c4 -r 538c866aaac6 usr/src/common/zfs/zfs_comutil.h --- a/usr/src/common/zfs/zfs_comutil.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/common/zfs/zfs_comutil.h Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,6 +36,9 @@ extern boolean_t zfs_allocatable_devs(nvlist_t *); extern void zpool_get_rewind_policy(nvlist_t *, zpool_rewind_policy_t *); +extern int zfs_zpl_version_map(int spa_version); +extern int zfs_spa_version_map(int zpl_version); + #ifdef __cplusplus } #endif diff -r fdae577692c4 -r 538c866aaac6 usr/src/common/zfs/zfs_prop.c --- a/usr/src/common/zfs/zfs_prop.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/common/zfs/zfs_prop.c Tue Mar 16 09:43:38 2010 -0600 @@ -153,6 +153,7 @@ { "2", 2 }, { "3", 3 }, { "4", 4 }, + { "5", 5 }, { "current", ZPL_VERSION }, { NULL } }; diff -r fdae577692c4 -r 538c866aaac6 usr/src/grub/capability --- a/usr/src/grub/capability Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/grub/capability Tue Mar 16 09:43:38 2010 -0600 @@ -40,7 +40,7 @@ # This file and the associated version are Solaris specific and are # not a part of the open source distribution of GRUB. # -VERSION=15 +VERSION=16 dboot xVM zfs diff -r fdae577692c4 -r 538c866aaac6 usr/src/grub/grub-0.97/stage2/fsys_zfs.c --- a/usr/src/grub/grub-0.97/stage2/fsys_zfs.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/grub/grub-0.97/stage2/fsys_zfs.c Tue Mar 16 09:43:38 2010 -0600 @@ -670,6 +670,7 @@ zapbuf = stack; size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; stack += size; + if (errnum = dmu_read(zap_dnode, 0, zapbuf, stack)) return (errnum); @@ -1425,7 +1426,44 @@ } /* get the file size and set the file position to 0 */ - filemax = ((znode_phys_t *)DN_BONUS(DNODE))->zp_size; + + /* + * For DMU_OT_SA we will need to locate the SIZE attribute + * attribute, which could be either in the bonus buffer + * or the "spill" block. 
+ */ + if (DNODE->dn_bonustype == DMU_OT_SA) { + sa_hdr_phys_t *sahdrp; + int hdrsize; + + sahdrp = (sa_hdr_phys_t *)DN_BONUS(DNODE); + if (DNODE->dn_bonuslen != 0) { + sahdrp = (sa_hdr_phys_t *)DN_BONUS(DNODE); + } else { + if (DNODE->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + blkptr_t *bp = &DNODE->dn_spill; + void *buf; + + buf = (void *)stack; + stack += BP_GET_LSIZE(bp); + + /* reset errnum to rawread() failure */ + errnum = 0; + if (zio_read(bp, buf, stack) != 0) { + return (0); + } + sahdrp = buf; + } else { + errnum = ERR_FSYS_CORRUPT; + return (0); + } + } + hdrsize = SA_HDR_SIZE(sahdrp); + filemax = *(uint64_t *)((char *)sahdrp + hdrsize + + SA_SIZE_OFFSET); + } else { + filemax = ((znode_phys_t *)DN_BONUS(DNODE))->zp_size; + } filepos = 0; dnode_buf = NULL; diff -r fdae577692c4 -r 538c866aaac6 usr/src/grub/grub-0.97/stage2/fsys_zfs.h --- a/usr/src/grub/grub-0.97/stage2/fsys_zfs.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/grub/grub-0.97/stage2/fsys_zfs.h Tue Mar 16 09:43:38 2010 -0600 @@ -53,6 +53,7 @@ #include #include #include +#include /* * Global Memory addresses to store MOS and DNODE data diff -r fdae577692c4 -r 538c866aaac6 usr/src/grub/grub-0.97/stage2/zfs-include/dmu.h --- a/usr/src/grub/grub-0.97/stage2/zfs-include/dmu.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/grub/grub-0.97/stage2/zfs-include/dmu.h Tue Mar 16 09:43:38 2010 -0600 @@ -17,15 +17,13 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DMU_H #define _SYS_DMU_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file describes the interface that the DMU provides for its * consumers. @@ -75,7 +73,22 @@ DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ - + DMU_OT_DSL_PERMS, /* ZAP */ + DMU_OT_ACL, /* ACL */ + DMU_OT_SYSACL, /* SYSACL */ + DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ + DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ + DMU_OT_NEXT_CLONES, /* ZAP */ + DMU_OT_SCRUB_QUEUE, /* ZAP */ + DMU_OT_USERGROUP_USED, /* ZAP */ + DMU_OT_USERGROUP_QUOTA, /* ZAP */ + DMU_OT_USERREFS, /* ZAP */ + DMU_OT_DDT_ZAP, /* ZAP */ + DMU_OT_DDT_STATS, /* ZAP */ + DMU_OT_SA, /* System attr */ + DMU_OT_SA_MASTER_NODE, /* ZAP */ + DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ + DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_NUMTYPES } dmu_object_type_t; diff -r fdae577692c4 -r 538c866aaac6 usr/src/grub/grub-0.97/stage2/zfs-include/dnode.h --- a/usr/src/grub/grub-0.97/stage2/zfs-include/dnode.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/grub/grub-0.97/stage2/zfs-include/dnode.h Tue Mar 16 09:43:38 2010 -0600 @@ -17,15 +17,13 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DNODE_H #define _SYS_DNODE_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Fixed constants. 
*/ @@ -49,6 +47,8 @@ #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) #define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) +#define DNODE_FLAG_SPILL_BLKPTR (1<<2) + #define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) @@ -72,7 +72,8 @@ uint64_t dn_pad3[4]; blkptr_t dn_blkptr[1]; - uint8_t dn_bonus[DN_MAX_BONUSLEN]; + uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)]; + blkptr_t dn_spill; } dnode_phys_t; #endif /* _SYS_DNODE_H */ diff -r fdae577692c4 -r 538c866aaac6 usr/src/grub/grub-0.97/stage2/zfs-include/sa_impl.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/grub/grub-0.97/stage2/zfs-include/sa_impl.h Tue Mar 16 09:43:38 2010 -0600 @@ -0,0 +1,38 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SA_IMPL_H +#define _SYS_SA_IMPL_H + +typedef struct sa_hdr_phys { + uint32_t sa_magic; + uint16_t sa_layout_info; + uint16_t sa_lengths[1]; +} sa_hdr_phys_t; + +#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0) +#define SA_SIZE_OFFSET 0x8 + +#endif /* _SYS_SA_IMPL_H */ diff -r fdae577692c4 -r 538c866aaac6 usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h --- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h Tue Mar 16 09:43:38 2010 -0600 @@ -27,7 +27,7 @@ /* * On-disk version number. */ -#define SPA_VERSION 23ULL +#define SPA_VERSION 24ULL /* * The following are configuration names used in the nvlist describing a pool's diff -r fdae577692c4 -r 538c866aaac6 usr/src/grub/grub-0.97/stage2/zfs-include/zfs_znode.h --- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs_znode.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs_znode.h Tue Mar 16 09:43:38 2010 -0600 @@ -17,7 +17,7 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
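
How the boot code sizes a file whose znode is stored as system attributes: a
condensed sketch of the fsys_zfs.c change above, assuming the sa_impl.h and
dnode.h definitions just added (the dn_bonuslen == 0 case, where the SA header
must first be read from the spill block, is omitted here):

    static uint64_t
    sa_file_size(dnode_phys_t *dnp)
    {
            sa_hdr_phys_t *sahdrp;

            if (dnp->dn_bonustype != DMU_OT_SA)
                    return (((znode_phys_t *)DN_BONUS(dnp))->zp_size);

            /* the variable-length SA header precedes the attribute values */
            sahdrp = (sa_hdr_phys_t *)DN_BONUS(dnp);
            return (*(uint64_t *)((char *)sahdrp + SA_HDR_SIZE(sahdrp) +
                SA_SIZE_OFFSET));
    }
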
*/ @@ -27,8 +27,9 @@ #define MASTER_NODE_OBJ 1 #define ZFS_ROOT_OBJ "ROOT" #define ZPL_VERSION_STR "VERSION" +#define ZFS_SA_ATTRS "SA_ATTRS" -#define ZPL_VERSION 4ULL +#define ZPL_VERSION 5ULL #define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) diff -r fdae577692c4 -r 538c866aaac6 usr/src/lib/libzfs/common/libzfs_impl.h --- a/usr/src/lib/libzfs/common/libzfs_impl.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/lib/libzfs/common/libzfs_impl.h Tue Mar 16 09:43:38 2010 -0600 @@ -30,7 +30,6 @@ #include #include #include -#include #include #include diff -r fdae577692c4 -r 538c866aaac6 usr/src/lib/libzfs/common/libzfs_sendrecv.c --- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c Tue Mar 16 09:43:38 2010 -0600 @@ -203,6 +203,7 @@ struct drr_end *drre = &thedrr.drr_u.drr_end; struct drr_object *drro = &thedrr.drr_u.drr_object; struct drr_write *drrw = &thedrr.drr_u.drr_write; + struct drr_spill *drrs = &thedrr.drr_u.drr_spill; FILE *ofp; int outfd; dmu_replay_record_t wbr_drr = {0}; @@ -302,6 +303,18 @@ break; } + case DRR_SPILL: + { + if (cksum_and_write(drr, sizeof (dmu_replay_record_t), + &stream_cksum, outfd) == -1) + goto out; + (void) ssread(buf, drrs->drr_length, ofp); + if (cksum_and_write(buf, drrs->drr_length, + &stream_cksum, outfd) == -1) + goto out; + break; + } + case DRR_FREEOBJECTS: { if (cksum_and_write(drr, sizeof (dmu_replay_record_t), @@ -1154,6 +1167,14 @@ dedup_arg_t dda = { 0 }; int featureflags = 0; + if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) { + uint64_t version; + version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + if (version >= ZPL_VERSION_SA) { + featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + } + } + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot send '%s'"), zhp->zfs_name); @@ -2180,7 +2201,14 @@ (void) recv_read(hdl, fd, buf, drr->drr_u.drr_write.drr_length, B_FALSE, NULL); break; - + case DRR_SPILL: + if (byteswap) { + drr->drr_u.drr_write.drr_length = + BSWAP_64(drr->drr_u.drr_spill.drr_length); + } + (void) recv_read(hdl, fd, buf, + drr->drr_u.drr_spill.drr_length, B_FALSE, NULL); + break; case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: diff -r fdae577692c4 -r 538c866aaac6 usr/src/lib/libzfs/common/mapfile-vers --- a/usr/src/lib/libzfs/common/mapfile-vers Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/lib/libzfs/common/mapfile-vers Tue Mar 16 09:43:38 2010 -0600 @@ -134,6 +134,7 @@ zfs_smb_acl_rename; zfs_snapshot; zfs_spa_version; + zfs_spa_version_map; zfs_type_to_name; zfs_unmount; zfs_unmountall; @@ -146,6 +147,7 @@ zfs_unshareall_smb; zfs_userspace; zfs_userquota_prop_prefixes; + zfs_zpl_version_map; zpool_add; zpool_clear; zpool_clear_label; diff -r fdae577692c4 -r 538c866aaac6 usr/src/lib/libzpool/common/kernel.c --- a/usr/src/lib/libzpool/common/kernel.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/lib/libzpool/common/kernel.c Tue Mar 16 09:43:38 2010 -0600 @@ -776,6 +776,17 @@ return (0); } +int +ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result) +{ + char *end; + + *result = strtoull(str, &end, base); + if (*result == 0) + return (errno); + return (0); +} + /* * ========================================================================= * kernel emulation setup & teardown diff -r fdae577692c4 -r 538c866aaac6 usr/src/lib/libzpool/common/llib-lzpool --- a/usr/src/lib/libzpool/common/llib-lzpool Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/lib/libzpool/common/llib-lzpool Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL 
HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -50,6 +50,8 @@ #include #include #include +#include +#include extern uint64_t metaslab_gang_bang; extern uint64_t metaslab_df_alloc_threshold; diff -r fdae577692c4 -r 538c866aaac6 usr/src/lib/libzpool/common/sys/zfs_context.h --- a/usr/src/lib/libzpool/common/sys/zfs_context.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/lib/libzpool/common/sys/zfs_context.h Tue Mar 16 09:43:38 2010 -0600 @@ -536,6 +536,9 @@ extern int ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result); +extern int ddi_strtoull(const char *str, char **nptr, int base, + u_longlong_t *result); + /* ZFS Boot Related stuff. */ struct _buf { diff -r fdae577692c4 -r 538c866aaac6 usr/src/psm/stand/bootblks/zfs/common/zfs.fth --- a/usr/src/psm/stand/bootblks/zfs/common/zfs.fth Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/psm/stand/bootblks/zfs/common/zfs.fth Tue Mar 16 09:43:38 2010 -0600 @@ -19,13 +19,13 @@ \ CDDL HEADER END \ \ -\ Copyright 2009 Sun Microsystems, Inc. All rights reserved. +\ Copyright 2010 Sun Microsystems, Inc. All rights reserved. \ Use is subject to license terms. \ purpose: ZFS file system support package -copyright: Copyright 2009 Sun Microsystems, Inc. All Rights Reserved +copyright: Copyright 2010 Sun Microsystems, Inc. All Rights Reserved " /packages" get-package push-package @@ -395,13 +395,18 @@ \ ZFS dnode (DMU) routines \ + d# 44 constant ot-sa# + d# 512 constant /dnode - : dn_indblkshift ( dn -- n ) h# 1 + c@ ; - : dn_nlevels ( dn -- n ) h# 2 + c@ ; - : dn_datablkszsec ( dn -- n ) h# 8 + w@ ; - : dn_blkptr ( dn -- p ) h# 40 + ; - : dn_bonus ( dn -- p ) h# c0 + ; + : dn_indblkshift ( dn -- n ) h# 1 + c@ ; + : dn_nlevels ( dn -- n ) h# 2 + c@ ; + : dn_bonustype ( dn -- n ) h# 4 + c@ ; + : dn_datablkszsec ( dn -- n ) h# 8 + w@ ; + : dn_bonuslen ( dn -- n ) h# a + w@ ; + : dn_blkptr ( dn -- p ) h# 40 + ; + : dn_bonus ( dn -- p ) h# c0 + ; + : dn_spill ( dn -- p ) h# 180 + ; 0 instance value dnode @@ -755,7 +760,6 @@ 0 instance value mos-dn 0 instance value obj-dir 0 instance value root-dsl - 0 instance value root-dsl# 0 instance value fs-dn \ dn-cache contains dc-dn's contents at dc-blk# @@ -819,7 +823,6 @@ obj-dir " root_dataset" zap-lookup if " no root_dataset" die then ( obj# ) - dup to root-dsl# get-mos-dnode ( ) dnode root-dsl /dnode move ; @@ -888,6 +891,20 @@ \ 1 constant master-node# + + 0 instance value bootfs-obj# + 0 instance value root-obj# + 0 instance value current-obj# + 0 instance value search-obj# + + instance defer fsize ( dn -- size ) + instance defer mode ( dn -- mode ) + instance defer parent ( dn -- obj# ) + instance defer readlink ( dst dn -- ) + + \ + \ routines when bonus pool contains a znode + \ d# 264 constant /znode d# 56 constant /zn-slink @@ -895,15 +912,77 @@ : zp_size ( zn -- n ) h# 50 + x@ ; : zp_parent ( zn -- n ) h# 58 + x@ ; - 0 instance value bootfs-obj# - 0 instance value root-obj# - 0 instance value current-obj# - 0 instance value search-obj# - alias >znode dn_bonus - : fsize ( dn -- n ) >znode zp_size ; - : ftype ( dn -- n ) >znode zp_mode h# f000 and ; + : zn-fsize ( dn -- n ) >znode zp_size ; + : zn-mode ( dn -- n ) >znode zp_mode ; + : zn-parent ( dn -- n ) >znode zp_parent ; + + \ copy symlink target to dst + : zn-readlink ( dst dn -- ) + dup zn-fsize tuck /zn-slink > if ( dst size dn ) + \ contents in 1st block + temp-space over dn-bsize ( dst size dn 
t-adr bsize ) + rot 0 lblk#>bp read-bp ( dst size ) + temp-space ( dst size src ) + else ( dst size dn ) + \ contents in dnode + >znode /znode + ( dst size src ) + then ( dst size src ) + -rot move ( ) + ; + + \ + \ routines when bonus pool contains sa's + \ + + \ SA header size when link is in dn_bonus + d# 16 constant /sahdr-link + + : sa_props ( sa -- n ) h# 4 + w@ ; + + : sa-hdrsz ( sa -- sz ) sa_props h# 7 >> ; + + alias >sa dn_bonus + + : >sadata ( dn -- adr ) >sa dup sa-hdrsz + ; + : sa-mode ( dn -- n ) >sadata x@ ; + : sa-fsize ( dn -- n ) >sadata h# 8 + x@ ; + : sa-parent ( dn -- n ) >sadata h# 28 + x@ ; + + \ copy symlink target to dst + : sa-readlink ( dst dn -- ) + dup >sa sa-hdrsz /sahdr-link <> if + \ contents in 1st attr of dn_spill + temp-space over dn_spill ( dst dn t-adr bp ) + dup bp-lsize swap read-bp ( dst dn ) + sa-fsize ( dst size ) + temp-space dup sa-hdrsz + ( dst size src ) + else ( dst dn ) + \ content in bonus buf + dup dn_bonus over dn_bonuslen + ( dst dn ebonus ) + swap sa-fsize tuck - ( dst size src ) + then ( dst size src ) + -rot move ( ) + ; + + + \ setup attr routines for dn + : set-attr ( dn -- ) + dn_bonustype ot-sa# = if + ['] sa-fsize to fsize + ['] sa-mode to mode + ['] sa-parent to parent + ['] sa-readlink to readlink + else + ['] zn-fsize to fsize + ['] zn-mode to mode + ['] zn-parent to parent + ['] zn-readlink to readlink + then + ; + + : ftype ( dn -- type ) mode h# f000 and ; : dir? ( dn -- flag ) ftype h# 4000 = ; : symlink? ( dn -- flag ) ftype h# a000 = ; @@ -959,7 +1038,7 @@ then 2dup " .." $= if - 2drop >znode zp_parent ( obj# ) + 2drop parent ( obj# ) else ( dn file$ ) \ search dir current-obj# to search-obj# @@ -967,38 +1046,32 @@ true exit ( not-found ) then ( obj# ) then ( obj# ) - get-fs-dnode false ( found ) + get-fs-dnode + dnode set-attr + false ( found ) ; /buf-len instance buffer: fpath-buf - : clr-fpath-buf ( -- ) fpath-buf /buf-len erase ; - - : fpath-buf$ ( -- path$ ) fpath-buf cscount ; + /buf-len instance buffer: tpath-buf - \ copy symlink target to adr - : readlink ( dst dn -- ) - dup fsize tuck /zn-slink > if ( dst size dn ) - \ contents in 1st block - temp-space over dn-bsize ( dst size dn t-adr bsize ) - rot 0 lblk#>bp read-bp ( dst size ) - temp-space ( dst size src ) - else ( dst size dn ) - \ contents in dnode - >znode /znode + ( dst size src ) - then ( dst size src ) - -rot move ( ) - ; + : tpath-buf$ ( -- path$ ) tpath-buf cscount ; + : fpath-buf$ ( -- path$ ) fpath-buf cscount ; \ modify tail to account for symlink : follow-symlink ( tail$ -- tail$' ) - clr-fpath-buf ( tail$ ) - fpath-buf dnode readlink + \ read target + tpath-buf /buf-len erase + tpath-buf dnode readlink - \ append to current path + \ append current path ?dup if ( tail$ ) - " /" fpath-buf$ $append ( tail$ ) - fpath-buf$ $append ( ) + " /" tpath-buf$ $append ( tail$ ) + tpath-buf$ $append ( ) else drop then ( ) + + \ copy to fpath + fpath-buf /buf-len erase + tpath-buf$ fpath-buf swap move fpath-buf$ ( path$ ) \ get directory that starts changed path @@ -1008,6 +1081,7 @@ search-obj# ( path$ obj# ) then ( path$ obj# ) get-fs-dnode ( path$ ) + dnode set-attr ; \ open dnode at path @@ -1020,6 +1094,7 @@ current-obj# ( path$ obj# ) then ( path$ obj# ) get-fs-dnode ( path$ ) + dnode set-attr \ lookup each path component begin ( path$ ) @@ -1173,7 +1248,7 @@ \ zero instance buffers file-records /file-records erase - bootprop-buf /buf-len erase + bootprop-buf /buf-len erase ; : release-buffers ( -- ) diff -r fdae577692c4 -r 538c866aaac6 
usr/src/uts/common/Makefile.files --- a/usr/src/uts/common/Makefile.files Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/Makefile.files Tue Mar 16 09:43:38 2010 -0600 @@ -1338,6 +1338,7 @@ lzjb.o \ metaslab.o \ refcount.o \ + sa.o \ sha256.o \ spa.o \ spa_config.o \ @@ -1363,6 +1364,7 @@ zfs_byteswap.o \ zfs_fm.o \ zfs_fuid.o \ + zfs_sa.o \ zfs_znode.o \ zil.o \ zio.o \ diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/dbuf.c --- a/usr/src/uts/common/fs/zfs/dbuf.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dbuf.c Tue Mar 16 09:43:38 2010 -0600 @@ -34,6 +34,8 @@ #include #include #include +#include +#include static void dbuf_destroy(dmu_buf_impl_t *db); static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); @@ -296,13 +298,17 @@ ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); - ASSERT(db->db_blkid == DB_BONUS_BLKID || - list_head(&dn->dn_dbufs)); + ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == + DMU_SPILL_BLKID || list_head(&dn->dn_dbufs)); } - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); - ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); + ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); + } else if (db->db_blkid == DMU_SPILL_BLKID) { + ASSERT(dn != NULL); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + ASSERT3U(db->db.db_offset, ==, 0); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } @@ -336,8 +342,9 @@ ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; @@ -357,7 +364,7 @@ } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && - db->db.db_data && db->db_blkid != DB_BONUS_BLKID && + db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* * If the blkptr isn't set but they have nonzero data, @@ -465,7 +472,7 @@ dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else { - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); VERIFY(arc_buf_remove_ref(buf, db) == 1); db->db_state = DB_UNCACHED; @@ -490,7 +497,7 @@ ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ASSERT3U(bonuslen, <=, db->db.db_size); @@ -570,7 +577,7 @@ if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); - prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && + prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL && DBUF_IS_CACHEABLE(db); @@ -630,7 +637,7 @@ dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!refcount_is_zero(&db->db_holds)); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); @@ -675,7 +682,7 @@ if (dr == NULL || (dr->dt.dl.dr_data != - ((db->db_blkid == DB_BONUS_BLKID) ? 
db->db.db_data : db->db_buf))) + ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) return; /* @@ -686,7 +693,7 @@ * just null out the current db_data pointer. */ ASSERT(dr->dr_txg >= txg - 2); - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); @@ -713,7 +720,7 @@ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ASSERT(db->db_level == 0); - if (db->db_blkid == DB_BONUS_BLKID || + if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; @@ -751,7 +758,7 @@ uint64_t first_l1 = start >> epbs; uint64_t last_l1 = end >> epbs; - if (end > dn->dn_maxblkid) { + if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) { end = dn->dn_maxblkid; last_l1 = end >> epbs; } @@ -759,7 +766,7 @@ mutex_enter(&dn->dn_dbufs_mtx); for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level == 1 && db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { @@ -873,7 +880,7 @@ int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* XXX does *this* func really need the lock? */ ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); @@ -970,6 +977,9 @@ } mutex_exit(&dn->dn_mtx); + if (db->db_blkid == DMU_SPILL_BLKID) + dn->dn_have_spill = B_TRUE; + /* * If this buffer is already dirty, we're done. */ @@ -979,7 +989,7 @@ while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) drp = &dr->dr_next; if (dr && dr->dr_txg == tx->tx_txg) { - if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. @@ -1020,7 +1030,7 @@ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - if (db->db_blkid != DB_BONUS_BLKID) { + if (db->db_blkid != DMU_BONUS_BLKID) { /* * Update the accounting. * Note: we delay "free accounting" until after we drop @@ -1042,7 +1052,7 @@ void *data_old = db->db_buf; if (db->db_state != DB_NOFILL) { - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { @@ -1078,7 +1088,8 @@ * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. 
*/ - if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); dnode_clear_range(dn, db->db_blkid, 1, tx); mutex_exit(&dn->dn_mtx); @@ -1094,7 +1105,8 @@ mutex_exit(&db->db_mtx); - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); @@ -1182,7 +1194,7 @@ dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); /* @@ -1297,7 +1309,7 @@ { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!refcount_is_zero(&db->db_holds)); @@ -1319,7 +1331,7 @@ if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); @@ -1340,7 +1352,7 @@ { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); ASSERT(buf != NULL); @@ -1423,7 +1435,7 @@ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } @@ -1437,7 +1449,7 @@ db->db_state = DB_EVICTING; db->db_blkptr = NULL; - if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { + if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { list_remove(&dn->dn_dbufs, db); dnode_rele(dn, db); db->db_dnode = NULL; @@ -1466,7 +1478,19 @@ *parentp = NULL; *bpp = NULL; - ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(blkid != DMU_BONUS_BLKID); + + if (blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) + *bpp = &dn->dn_phys->dn_spill; + else + *bpp = NULL; + dbuf_add_ref(dn->dn_dbuf, NULL); + *parentp = dn->dn_dbuf; + mutex_exit(&dn->dn_mtx); + return (0); + } if (dn->dn_phys->dn_nlevels == 0) nlevels = 1; @@ -1539,16 +1563,20 @@ db->db_immediate_evict = 0; db->db_freed_in_flight = 0; - if (blkid == DB_BONUS_BLKID) { + if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); - db->db.db_offset = DB_BONUS_BLKID; + db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); + } else if (blkid == DMU_SPILL_BLKID) { + db->db.db_size = (blkptr != NULL) ? + BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; + db->db.db_offset = 0; } else { int blocksize = db->db_level ? 
1<dn_indblkshift : dn->dn_datablksz; @@ -1616,7 +1644,7 @@ { ASSERT(refcount_is_zero(&db->db_holds)); - if (db->db_blkid != DB_BONUS_BLKID) { + if (db->db_blkid != DMU_BONUS_BLKID) { /* * If this dbuf is still on the dn_dbufs list, * remove it from that list. @@ -1652,7 +1680,7 @@ dmu_buf_impl_t *db = NULL; blkptr_t *bp = NULL; - ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (dnode_block_freed(dn, blkid)) @@ -1708,7 +1736,7 @@ { dmu_buf_impl_t *db, *parent = NULL; - ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT3U(dn->dn_nlevels, >, level); @@ -1757,7 +1785,7 @@ * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. */ - if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; @@ -1812,7 +1840,34 @@ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); - dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); + dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); +} + +int +dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + if (db->db_blkid != DMU_SPILL_BLKID) + return (ENOTSUP); + if (blksz == 0) + blksz = SPA_MINBLOCKSIZE; + if (blksz > SPA_MAXBLOCKSIZE) + blksz = SPA_MAXBLOCKSIZE; + else + blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); + + rw_enter(&db->db_dnode->dn_struct_rwlock, RW_WRITER); + dbuf_new_size(db, blksz, tx); + rw_exit(&db->db_dnode->dn_struct_rwlock); + + return (0); +} + +void +dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) +{ + dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); + dnode_rm_spill(dn, tx); } #pragma weak dmu_buf_add_ref = dbuf_add_ref @@ -1858,7 +1913,7 @@ dbuf_evict_user(db); if (holds == 0) { - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { mutex_exit(&db->db_mtx); dnode_rele(db->db_dnode, db); } else if (db->db_buf == NULL) { @@ -1971,6 +2026,11 @@ if (db->db_blkptr != NULL) return; + if (db->db_blkid == DMU_SPILL_BLKID) { + db->db_blkptr = &dn->dn_phys->dn_spill; + BP_ZERO(db->db_blkptr); + return; + } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was @@ -2071,13 +2131,19 @@ } DBUF_VERIFY(db); + if (db->db_blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; + mutex_exit(&dn->dn_mtx); + } + /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). 
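/*
 * Illustrative sketch only (not part of this changeset): the spill
 * block sizing rule applied by dbuf_spill_set_blksz() above, restated
 * as a self-contained user-space function.  The 512-byte and 128K
 * limits stand in for SPA_MINBLOCKSIZE and SPA_MAXBLOCKSIZE and are
 * assumptions of this sketch.
 */
#include <stdint.h>

static uint64_t
spill_blksz(uint64_t requested)
{
	const uint64_t minblk = 512;		/* SPA_MINBLOCKSIZE (assumed) */
	const uint64_t maxblk = 131072;		/* SPA_MAXBLOCKSIZE (assumed) */

	if (requested == 0)
		return (minblk);
	if (requested > maxblk)
		return (maxblk);
	/* round up to the next multiple of minblk, as P2ROUNDUP() does */
	return ((requested + minblk - 1) & ~(minblk - 1));
}
/* e.g. spill_blksz(600) == 1024, spill_blksz(0) == 512 */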
*/ - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_dirty_record_t **drp; ASSERT(*datap != NULL); @@ -2204,14 +2270,27 @@ return; } - ASSERT(BP_GET_TYPE(bp) == dn->dn_type); + ASSERT((db->db_blkid != DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_type) || + (db->db_blkid == DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_bonustype)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); mutex_enter(&db->db_mtx); +#ifdef ZFS_DEBUG + if (db->db_blkid == DMU_SPILL_BLKID) { + dnode_t *dn = db->db_dnode; + ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); + ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && + db->db_blkptr == &dn->dn_phys->dn_spill); + } +#endif + if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); - if (db->db_blkid > dn->dn_phys->dn_maxblkid) + if (db->db_blkid > dn->dn_phys->dn_maxblkid && + db->db_blkid != DMU_SPILL_BLKID) dn->dn_phys->dn_maxblkid = db->db_blkid; mutex_exit(&dn->dn_mtx); @@ -2278,8 +2357,17 @@ ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; +#ifdef ZFS_DEBUG + if (db->db_blkid == DMU_SPILL_BLKID) { + dnode_t *dn = db->db_dnode; + ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); + ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && + db->db_blkptr == &dn->dn_phys->dn_spill); + } +#endif + if (db->db_level == 0) { - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) @@ -2362,6 +2450,7 @@ zbookmark_t zb; zio_prop_t zp; zio_t *zio; + int wp_flag = 0; if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { @@ -2385,9 +2474,12 @@ ASSERT(arc_released(parent->db_buf)); zio = parent->db_data_pending->dr_zio; } else { - ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); + ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && + db->db_blkid != DMU_SPILL_BLKID) || + (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); zio = dn->dn_zio; } @@ -2399,8 +2491,11 @@ os->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); - dmu_write_policy(os, dn, db->db_level, - db->db_state == DB_NOFILL ? WP_NOFILL : 0, &zp); + if (db->db_blkid == DMU_SPILL_BLKID) + wp_flag = WP_SPILL; + wp_flag |= (db->db_state == DB_NOFILL) ? 
WP_NOFILL : 0; + + dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { ASSERT(db->db_state != DB_NOFILL); diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/dmu.c --- a/usr/src/uts/common/fs/zfs/dmu.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu.c Tue Mar 16 09:43:38 2010 -0600 @@ -40,6 +40,7 @@ #include #include #include +#include #ifdef _KERNEL #include #include @@ -90,7 +91,10 @@ { zap_byteswap, TRUE, "snapshot refcount tags"}, { zap_byteswap, TRUE, "DDT ZAP algorithm" }, { zap_byteswap, TRUE, "DDT statistics" }, -}; + { byteswap_uint8_array, TRUE, "System attributes" }, + { zap_byteswap, TRUE, "SA master node" }, + { zap_byteswap, TRUE, "SA attr registration" }, + { zap_byteswap, TRUE, "SA attr layouts" }, }; int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, @@ -142,6 +146,33 @@ return (0); } +int +dmu_set_bonustype(dmu_buf_t *db, dmu_object_type_t type, dmu_tx_t *tx) +{ + dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + + if (type > DMU_OT_NUMTYPES) + return (EINVAL); + + if (dn->dn_bonus != (dmu_buf_impl_t *)db) + return (EINVAL); + + dnode_setbonus_type(dn, type, tx); + return (0); +} + +int +dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int error; + + error = dnode_hold(os, object, FTAG, &dn); + dbuf_rm_spill(dn, tx); + dnode_rele(dn, FTAG); + return (error); +} + /* * returns ENOENT, EIO, or 0. */ @@ -179,6 +210,61 @@ } /* + * returns ENOENT, EIO, or 0. + * + * This interface will allocate a blank spill dbuf when a spill blk + * doesn't already exist on the dnode. + * + * if you only want to find an already existing spill db, then + * dmu_spill_hold_existing() should be used. + */ +int +dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = NULL; + int err; + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&dn->dn_struct_rwlock); + + ASSERT(db != NULL); + err = dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | flags); + *dbp = &db->db; + return (err); +} + +int +dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +{ + dnode_t *dn = ((dmu_buf_impl_t *)bonus)->db_dnode; + int err; + + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) + return (EINVAL); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + if (!dn->dn_have_spill) { + rw_exit(&dn->dn_struct_rwlock); + return (ENOENT); + } + err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT, tag, dbp); + rw_exit(&dn->dn_struct_rwlock); + return (err); +} + +int +dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +{ + return (dmu_spill_hold_by_dnode(((dmu_buf_impl_t *)bonus)->db_dnode, + 0, tag, dbp)); +} + +/* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files @@ -1349,7 +1435,7 @@ zp->zp_checksum = checksum; zp->zp_compress = compress; - zp->zp_type = type; + zp->zp_type = (wp & WP_SPILL) ? 
dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; @@ -1514,6 +1600,7 @@ arc_init(); l2arc_init(); xuio_stat_init(); + sa_cache_init(); } void @@ -1525,4 +1612,5 @@ dbuf_fini(); l2arc_fini(); xuio_stat_fini(); + sa_cache_fini(); } diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/dmu_objset.c --- a/usr/src/uts/common/fs/zfs/dmu_objset.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Tue Mar 16 09:43:38 2010 -0600 @@ -41,6 +41,7 @@ #include #include #include +#include spa_t * dmu_objset_spa(objset_t *os) @@ -500,6 +501,9 @@ secondary_cache_changed_cb, os)); } + if (os->os_sa) + sa_tear_down(os); + /* * We should need only a single pass over the dnode list, since * nothing can be added to the list at this point. @@ -1066,20 +1070,11 @@ } static void -do_userquota_callback(objset_t *os, dnode_phys_t *dnp, - boolean_t subtract, dmu_tx_t *tx) +do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, + uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) { - static const char zerobuf[DN_MAX_BONUSLEN] = {0}; - uint64_t user, group; - - ASSERT(dnp->dn_type != 0 || - (bcmp(DN_BONUS(dnp), zerobuf, DN_MAX_BONUSLEN) == 0 && - DN_USED_BYTES(dnp) == 0)); - - if ((dnp->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) && - 0 == used_cbs[os->os_phys->os_type](dnp->dn_bonustype, - DN_BONUS(dnp), &user, &group)) { - int64_t delta = DNODE_SIZE + DN_USED_BYTES(dnp); + if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { + int64_t delta = DNODE_SIZE + used; if (subtract) delta = -delta; VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, @@ -1090,7 +1085,7 @@ } void -dmu_objset_do_userquota_callbacks(objset_t *os, dmu_tx_t *tx) +dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) { dnode_t *dn; list_t *list = &os->os_synced_dnodes; @@ -1099,7 +1094,6 @@ while (dn = list_head(list)) { ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); - ASSERT(dn->dn_oldphys); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED); @@ -1116,20 +1110,39 @@ /* * We intentionally modify the zap object even if the - * net delta (due to phys-oldphys) is zero. Otherwise + * net delta is zero. Otherwise * the block of the zap obj could be shared between * datasets but need to be different between them after * a bprewrite. */ - do_userquota_callback(os, dn->dn_oldphys, B_TRUE, tx); - do_userquota_callback(os, dn->dn_phys, B_FALSE, tx); /* * The mutex is needed here for interlock with dnode_allocate. 
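/*
 * Illustrative sketch only: the net effect of the old/new id
 * bookkeeping above is a two-sided charge -- the dnode's previous
 * size is subtracted from its previous owner and its current size is
 * added to its current owner.  charge() is a hypothetical stand-in
 * for zap_increment_int() against the USERUSED/GROUPUSED objects, and
 * the 512-byte DNODE_SIZE is an assumption of this sketch.
 */
#include <stdint.h>
#include <stdio.h>

#define	DNODE_SIZE	512

static void
charge(const char *table, uint64_t id, int64_t delta)
{
	(void) printf("%s[%llu] += %lld\n", table,
	    (unsigned long long)id, (long long)delta);
}

static void
account_dnode(uint64_t old_used, uint64_t old_uid, uint64_t old_gid,
    uint64_t new_used, uint64_t new_uid, uint64_t new_gid)
{
	/* back out the charge recorded against the previous owner ... */
	charge("userused", old_uid, -(int64_t)(DNODE_SIZE + old_used));
	charge("groupused", old_gid, -(int64_t)(DNODE_SIZE + old_used));
	/* ... and record the charge against the current owner */
	charge("userused", new_uid, (int64_t)(DNODE_SIZE + new_used));
	charge("groupused", new_gid, (int64_t)(DNODE_SIZE + new_used));
}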
*/ mutex_enter(&dn->dn_mtx); - zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t)); - dn->dn_oldphys = NULL; + ASSERT(dn->dn_id_flags); + if (dn->dn_id_flags & DN_ID_OLD_EXIST) { + do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, + dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); + } + if (dn->dn_id_flags & DN_ID_NEW_EXIST) { + do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), + dn->dn_phys->dn_flags, dn->dn_newuid, + dn->dn_newgid, B_FALSE, tx); + } + + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + if (dn->dn_id_flags & DN_ID_NEW_EXIST) { + dn->dn_olduid = dn->dn_newuid; + dn->dn_oldgid = dn->dn_newgid; + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (dn->dn_bonuslen == 0) + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + else + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + dn->dn_id_flags &= ~(DN_ID_NEW_EXIST|DN_ID_SYNC); mutex_exit(&dn->dn_mtx); list_remove(list, dn); @@ -1137,6 +1150,71 @@ } } +void +dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before) +{ + objset_t *os = dn->dn_objset; + void *data = NULL; + dmu_buf_t *spilldb = NULL; + uint64_t *user, *group; + int flags = dn->dn_id_flags; + int error; + + if (!dmu_objset_userused_enabled(dn->dn_objset)) + return; + + if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| + DN_ID_CHKED_SPILL))) + return; + + if (before && dn->dn_bonuslen != 0) + data = DN_BONUS(dn->dn_phys); + else if (!before && dn->dn_bonuslen != 0) + data = dn->dn_bonus != NULL ? + dn->dn_bonus->db.db_data : DN_BONUS(dn->dn_phys); + else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { + int rf = 0; + + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + error = dmu_spill_hold_by_dnode(dn, rf, FTAG, &spilldb); + ASSERT(error == 0); + data = spilldb->db_data; + } else { + mutex_enter(&dn->dn_mtx); + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + mutex_exit(&dn->dn_mtx); + return; + } + + if (before) { + user = &dn->dn_olduid; + group = &dn->dn_oldgid; + } else { + user = &dn->dn_newuid; + group = &dn->dn_newgid; + } + + ASSERT(data); + error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, + user, group); + + mutex_enter(&dn->dn_mtx); + if (error == 0 && before) + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (error == 0 && !before) + dn->dn_id_flags |= DN_ID_NEW_EXIST; + + if (spilldb) { + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + } else { + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + mutex_exit(&dn->dn_mtx); + if (spilldb) + dmu_buf_rele(spilldb, FTAG); +} + boolean_t dmu_objset_userspace_present(objset_t *os) { diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/dmu_send.c --- a/usr/src/uts/common/fs/zfs/dmu_send.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_send.c Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -183,6 +183,31 @@ } static int +dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data) +{ + struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill); + + if (ba->pending_op != PENDING_NONE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + + /* write a SPILL record */ + bzero(ba->drr, sizeof (dmu_replay_record_t)); + ba->drr->drr_type = DRR_SPILL; + drrs->drr_object = object; + drrs->drr_length = blksz; + drrs->drr_toguid = ba->toguid; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + return (EINTR); + if (dump_bytes(ba, data, blksz)) + return (EINTR); + return (0); +} + +static int dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) { struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects); @@ -319,6 +344,18 @@ break; } (void) arc_buf_remove_ref(abuf, &abuf); + } else if (type == DMU_OT_SA) { + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf; + int blksz = BP_GET_LSIZE(bp); + + if (arc_read_nolock(NULL, spa, bp, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + + err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data); + (void) arc_buf_remove_ref(abuf, &abuf); } else { /* it's a level-0 block of a regular object */ uint32_t aflags = ARC_WAIT; arc_buf_t *abuf; @@ -908,6 +945,11 @@ DO64(drr_free.drr_length); DO64(drr_free.drr_toguid); break; + case DRR_SPILL: + DO64(drr_spill.drr_object); + DO64(drr_spill.drr_length); + DO64(drr_spill.drr_toguid); + break; case DRR_END: DO64(drr_end.drr_checksum.zc_word[0]); DO64(drr_end.drr_checksum.zc_word[1]); @@ -969,8 +1011,9 @@ drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen); } - if (err) + if (err) { return (EINVAL); + } tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, drro->drr_object); @@ -1121,6 +1164,56 @@ return (0); } +static int +restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) +{ + dmu_tx_t *tx; + void *data; + dmu_buf_t *db, *db_spill; + int err; + + if (drrs->drr_length < SPA_MINBLOCKSIZE || + drrs->drr_length > SPA_MAXBLOCKSIZE) + return (EINVAL); + + data = restore_read(ra, drrs->drr_length); + if (data == NULL) + return (ra->err); + + if (dmu_object_info(os, drrs->drr_object, NULL) != 0) + return (EINVAL); + + VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); + if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { + dmu_buf_rele(db, FTAG); + return (err); + } + + tx = dmu_tx_create(os); + + dmu_tx_hold_spill(tx, db->db_object); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + dmu_tx_abort(tx); + return (err); + } + dmu_buf_will_dirty(db_spill, tx); + + if (db_spill->db_size < drrs->drr_length) + VERIFY(0 == dbuf_spill_set_blksz(db_spill, + drrs->drr_length, tx)); + bcopy(data, db_spill->db_data, drrs->drr_length); + + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + + dmu_tx_commit(tx); + return (0); +} + /* ARGSUSED */ static int restore_free(struct restorearg *ra, objset_t *os, @@ -1276,6 +1369,12 @@ ra.err = ECKSUM; goto out; } + case DRR_SPILL: + { + struct drr_spill drrs = drr->drr_u.drr_spill; + ra.err = restore_spill(&ra, os, &drrs); + break; + } default: ra.err = EINVAL; goto out; diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/dmu_traverse.c --- a/usr/src/uts/common/fs/zfs/dmu_traverse.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c Tue Mar 16 
09:43:38 2010 -0600 @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include struct prefetch_data { @@ -273,6 +275,17 @@ break; lasterr = err; } + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + SET_BOOKMARK(&czb, objset, + object, 0, DMU_SPILL_BLKID); + err = traverse_visitbp(td, dnp, buf, + (blkptr_t *)&dnp->dn_spill, &czb); + if (err) { + if (!hard) + break; + lasterr = err; + } + } } return (err != 0 ? err : lasterr); } diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/dmu_tx.c --- a/usr/src/uts/common/fs/zfs/dmu_tx.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c Tue Mar 16 09:43:38 2010 -0600 @@ -33,7 +33,10 @@ #include #include /* for fzap_default_block_shift */ #include +#include +#include #include +#include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); @@ -813,10 +816,11 @@ match_offset = TRUE; /* * We will let this hold work for the bonus - * buffer so that we don't need to hold it - * when creating a new object. + * or spill buffer so that we don't need to + * hold it when creating a new object. */ - if (blkid == DB_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID || + blkid == DMU_SPILL_BLKID) match_offset = TRUE; /* * They might have to increase nlevels, @@ -837,8 +841,12 @@ txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; + case THT_SPILL: + if (blkid == DMU_SPILL_BLKID) + match_offset = TRUE; + break; case THT_BONUS: - if (blkid == DB_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID) match_offset = TRUE; break; case THT_ZAP: @@ -1204,3 +1212,141 @@ kmem_free(dcb, sizeof (dmu_tx_callback_t)); } } + +/* + * Interface to hold a bunch of attributes. + * used for creating new files. + * attrsize is the total size of all attributes + * to be added during object creation + * + * For updating/adding a single attribute dmu_tx_hold_sa() should be used. + */ + +/* + * hold necessary attribute name for attribute registration. + * should be a very rare case where this is needed. If it does + * happen it would only happen on the first write to the file system. 
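/*
 * Illustrative sketch only: how a caller might be expected to pair
 * dmu_tx_hold_sa() (defined below) with the usual transaction
 * lifecycle when updating an existing object's attributes.  The
 * header names and the error handling are assumptions of this
 * sketch, and the actual attribute update is elided to a comment.
 */
#include <sys/dmu.h>
#include <sys/sa.h>

static int
update_sa_example(objset_t *os, sa_handle_t *hdl)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	/* B_TRUE: the attribute set may grow, so the spill block is held too */
	dmu_tx_hold_sa(tx, hdl, B_TRUE);

	if ((err = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	/* ... modify attributes here, e.g. sa_attr_op(..., SA_UPDATE, tx) ... */

	dmu_tx_commit(tx);
	return (0);
}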
+ */ +static void +dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) +{ + int i; + + if (!sa->sa_need_attr_registration) + return; + + for (i = 0; i != sa->sa_num_attrs; i++) { + if (!sa->sa_attr_table[i].sa_registered) { + if (sa->sa_reg_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, + B_TRUE, sa->sa_attr_table[i].sa_name); + else + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, + B_TRUE, sa->sa_attr_table[i].sa_name); + } + } +} + + +void +dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) +{ + dnode_t *dn; + dmu_tx_hold_t *txh; + blkptr_t *bp; + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, + THT_SPILL, 0, 0); + + dn = txh->txh_dnode; + + if (dn == NULL) + return; + + /* If blkptr doesn't exist then add space to towrite */ + bp = &dn->dn_phys->dn_spill; + if (BP_IS_HOLE(bp)) { + txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + txh->txh_space_tounref = 0; + } else { + if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + bp->blk_birth)) + txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; + else + txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + if (bp->blk_birth) + txh->txh_space_tounref += SPA_MAXBLOCKSIZE; + } +} + +void +dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) +{ + sa_os_t *sa = tx->tx_objset->os_sa; + + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_layout_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + else { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) + return; + + (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, + THT_SPILL, 0, 0); +} + +/* + * Hold SA attribute + * + * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) + * + * variable_size is the total size of all variable sized attributes + * passed to this function. It is not the total size of all + * variable size attributes that *may* exist on this object. + */ +void +dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) +{ + uint64_t object; + sa_os_t *sa = tx->tx_objset->os_sa; + + ASSERT(hdl != NULL); + + object = sa_handle_object(hdl); + + dmu_tx_hold_bonus(tx, object); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || + tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + + if (sa->sa_force_spill || may_grow || hdl->sa_spill || + ((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_have_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } +} diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/dnode.c --- a/usr/src/uts/common/fs/zfs/dnode.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dnode.c Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -210,6 +210,11 @@ ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES); dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len); } + + /* Swap SPILL block if we have one */ + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) + byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t)); + } void @@ -258,6 +263,28 @@ rw_exit(&dn->dn_struct_rwlock); } +void +dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + dnode_setdirty(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dn->dn_bonustype = newtype; + dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; + rw_exit(&dn->dn_struct_rwlock); +} + +void +dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + dnode_setdirty(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK; + dn->dn_have_spill = B_FALSE; + rw_exit(&dn->dn_struct_rwlock); +} + static void dnode_setdblksz(dnode_t *dn, int size) { @@ -294,6 +321,7 @@ dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; dn->dn_maxblkid = dnp->dn_maxblkid; + dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); dmu_zfetch_init(&dn->dn_zfetch, dn); @@ -321,7 +349,7 @@ } ASSERT(NULL == list_head(&dn->dn_dbufs)); #endif - ASSERT(dn->dn_oldphys == NULL); + ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); list_remove(&os->os_dnodes, dn); @@ -368,6 +396,7 @@ ASSERT(ot != DMU_OT_NONE); ASSERT3U(ot, <, DMU_OT_NUMTYPES); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); @@ -383,6 +412,8 @@ ASSERT3U(dn->dn_next_nlevels[i], ==, 0); ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); + ASSERT3U(dn->dn_next_bonustype[i], ==, 0); + ASSERT3U(dn->dn_rm_spillblk[i], ==, 0); ASSERT3U(dn->dn_next_blksz[i], ==, 0); ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); @@ -393,7 +424,11 @@ dnode_setdblksz(dn, blocksize); dn->dn_indblkshift = ibs; dn->dn_nlevels = 1; - dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ + dn->dn_nblkptr = 1; + else + dn->dn_nblkptr = 1 + + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; @@ -407,10 +442,12 @@ } dn->dn_allocated_txg = tx->tx_txg; + dn->dn_id_flags = 0; dnode_setdirty(dn, tx); dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; } @@ -446,8 +483,14 @@ if (dn->dn_bonuslen != bonuslen) dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + if (dn->dn_bonustype != bonustype) + dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; if (dn->dn_nblkptr != nblkptr) dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; + if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = 
DN_KILL_SPILLBLK; + dn->dn_have_spill = B_FALSE; + } rw_exit(&dn->dn_struct_rwlock); /* change type */ @@ -627,11 +670,15 @@ if (dn->dn_free_txg || ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || ((flag & DNODE_MUST_BE_FREE) && - (type != DMU_OT_NONE || dn->dn_oldphys))) { + (type != DMU_OT_NONE || (dn->dn_id_flags & DN_ID_SYNC)))) { mutex_exit(&dn->dn_mtx); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } + if (flag & DNODE_MUST_BE_FREE) { + ASSERT(refcount_is_zero(&dn->dn_holds)); + ASSERT(!(dn->dn_id_flags & DN_ID_SYNC)); + } mutex_exit(&dn->dn_mtx); if (refcount_add(&dn->dn_holds, tag) == 1) @@ -706,6 +753,11 @@ mutex_exit(&dn->dn_mtx); #endif + /* + * Determine old uid/gid when necessary + */ + dmu_objset_userquota_get_ids(dn, B_TRUE); + mutex_enter(&os->os_lock); /* @@ -720,6 +772,7 @@ ASSERT(dn->dn_datablksz != 0); ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0); ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0); + ASSERT3U(dn->dn_next_bonustype[txg&TXG_MASK], ==, 0); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", dn->dn_object, txg); @@ -814,7 +867,8 @@ for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); - if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) { + if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); goto fail; } @@ -858,7 +912,7 @@ int epbs, new_nlevels; uint64_t sz; - ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(have_read ? RW_READ_HELD(&dn->dn_struct_rwlock) : @@ -915,7 +969,8 @@ for (dr = list_head(list); dr; dr = dr_next) { dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); if (dr->dr_dbuf->db_level != new_nlevels-1 && - dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) { + dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); list_remove(&dn->dn_dirty_records[txgoff], dr); list_insert_tail(&new->dt.di.dr_children, dr); @@ -1170,6 +1225,20 @@ rw_exit(&dn->dn_struct_rwlock); } +static boolean_t +dnode_spill_freed(dnode_t *dn) +{ + int i; + + mutex_enter(&dn->dn_mtx); + for (i = 0; i < TXG_SIZE; i++) { + if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) + break; + } + mutex_exit(&dn->dn_mtx); + return (i < TXG_SIZE); +} + /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid) @@ -1178,7 +1247,7 @@ void *dp = spa_get_dsl(dn->dn_objset->os_spa); int i; - if (blkid == DB_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID) return (FALSE); /* @@ -1191,6 +1260,9 @@ if (dn->dn_free_txg) return (TRUE); + if (blkid == DMU_SPILL_BLKID) + return (dnode_spill_freed(dn)); + range_tofind.fr_blkid = blkid; mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/dnode_sync.c --- a/usr/src/uts/common/fs/zfs/dnode_sync.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -434,7 +434,7 @@ db->db_last_dirty = NULL; db->db_dirtycnt -= 1; if (db->db_level == 0) { - ASSERT(db->db_blkid == DB_BONUS_BLKID || + ASSERT(db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_data == db->db_buf); dbuf_unoverride(dr); } @@ -490,6 +490,7 @@ dn->dn_maxblkid = 0; dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; + dn->dn_have_spill = B_FALSE; mutex_exit(&dn->dn_mtx); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); @@ -512,6 +513,7 @@ int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; static const dnode_phys_t zerodn = { 0 }; + boolean_t kill_spill = B_FALSE; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); @@ -523,10 +525,13 @@ if (dmu_objset_userused_enabled(dn->dn_objset) && !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { - ASSERT(dn->dn_oldphys == NULL); - dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t)); - *dn->dn_oldphys = *dn->dn_phys; /* struct assignment */ + mutex_enter(&dn->dn_mtx); + dn->dn_oldused = DN_USED_BYTES(dn->dn_phys); + dn->dn_oldflags = dn->dn_phys->dn_flags; + dn->dn_id_flags |= DN_ID_SYNC; dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; + mutex_exit(&dn->dn_mtx); + dmu_objset_userquota_get_ids(dn, B_FALSE); } else { /* Once we account for it, we should always account for it. */ ASSERT(!(dn->dn_phys->dn_flags & @@ -573,6 +578,24 @@ dn->dn_next_bonuslen[txgoff] = 0; } + if (dn->dn_next_bonustype[txgoff]) { + ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES); + dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; + dn->dn_next_bonustype[txgoff] = 0; + } + + /* + * We will either remove a spill block when a file is being removed + * or we have been asked to remove it. + */ + if (dn->dn_rm_spillblk[txgoff] || + ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && + dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) { + if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) + kill_spill = B_TRUE; + dn->dn_rm_spillblk[txgoff] = 0; + } + if (dn->dn_next_indblkshift[txgoff]) { ASSERT(dnp->dn_nlevels == 1); dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; @@ -589,6 +612,21 @@ mutex_exit(&dn->dn_mtx); + if (kill_spill) { + dmu_buf_impl_t *spilldb; + (void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx); + mutex_enter(&dn->dn_mtx); + dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; + mutex_exit(&dn->dn_mtx); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + spilldb = dbuf_find(dn, 0, DMU_SPILL_BLKID); + if (spilldb) { + spilldb->db_blkptr = NULL; + mutex_exit(&spilldb->db_mtx); + } + rw_exit(&dn->dn_struct_rwlock); + } + /* process all the "freed" ranges in the file */ while (rp = avl_last(&dn->dn_ranges[txgoff])) { dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx); diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/dsl_pool.c --- a/usr/src/uts/common/fs/zfs/dsl_pool.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c Tue Mar 16 09:43:38 2010 -0600 @@ -343,7 +343,7 @@ for (ds = list_head(&dp->dp_synced_datasets); ds; ds = list_next(&dp->dp_synced_datasets, ds)) - dmu_objset_do_userquota_callbacks(ds->ds_objset, tx); + dmu_objset_do_userquota_updates(ds->ds_objset, tx); /* * Sync the datasets again to push out the changes due to diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/dsl_scrub.c --- a/usr/src/uts/common/fs/zfs/dsl_scrub.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c Tue Mar 16 09:43:38 2010 -0600 @@ -42,6 +42,8 @@ #include #include #include +#include +#include typedef int (scrub_cb_t)(dsl_pool_t *, 
const blkptr_t *, const zbookmark_t *); @@ -612,6 +614,12 @@ SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zbookmark_t czb; + SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); + scrub_visitbp(dp, dnp, buf, &dnp->dn_spill, &czb); + } } } diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sa.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/fs/zfs/sa.c Tue Mar 16 09:43:38 2010 -0600 @@ -0,0 +1,1887 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ZFS System attributes: + * + * A generic mechanism to allow for arbitrary attributes + * to be stored in a dnode. The data will be stored in the bonus buffer of + * the dnode and if necessary a special "spill" block will be used to handle + * overflow situations. The spill block will be sized to fit the data + * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the + * spill block is stored at the end of the current bonus buffer. Any + * attributes that would be in the way of the blkptr_t will be relocated + * into the spill block. + * + * Attribute registration: + * + * Stored persistently on a per dataset basis + * a mapping between attribute "string" names and their actual attribute + * numeric values, length, and byteswap function. The names are only used + * during registration. All attributes are known by their unique attribute + * id value. If an attribute can have a variable size then the value + * 0 will be used to indicate this. + * + * Attribute Layout: + * + * Attribute layouts are a way to compactly store multiple attributes, but + * without taking the overhead associated with managing each attribute + * individually. Since you will typically have the same set of attributes + * stored in the same order a single table will be used to represent that + * layout. The ZPL for example will usually have only about 10 different + * layouts (regular files, device files, symlinks, + * regular files + scanstamp, files/dir with extended attributes, and then + * you have the possibility of all of those minus ACL, because it would + * be kicked out into the spill block) + * + * Layouts are simply an array of the attributes and their + * ordering i.e. [0, 1, 4, 5, 2] + * + * Each distinct layout is given a unique layout number and that is whats + * stored in the header at the beginning of the SA data buffer. 
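 *
 * As a concrete illustration of the above (the layout contents and
 * numbers here are hypothetical): an object whose attributes are laid
 * out as [0, 1, 4, 5, 2] might be assigned layout number 3, and "3" is
 * what the header of that object's SA buffer records.  If attribute 7
 * is later added, the whole set is rewritten as [0, 1, 4, 5, 2, 7];
 * should no registered layout already match that ordering, a new
 * layout number (say 4) is allocated, persisted in the layout ZAP,
 * and the rewritten buffer's header records layout 4 instead.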
+ * + * A layout only covers a single dbuf (bonus or spill). If a set of + * attributes is split up between the bonus buffer and a spill buffer then + * two different layouts will be used. This allows us to byteswap the + * spill without looking at the bonus buffer and keeps the on disk format of + * the bonus and spill buffer the same. + * + * Adding a single attribute will cause the entire set of attributes to + * be rewritten and could result in a new layout number being constructed + * as part of the rewrite if no such layout exists for the new set of + * attribues. The new attribute will be appended to the end of the already + * existing attributes. + * + * Both the attribute registration and attribute layout information are + * stored in normal ZAP attributes. Their should be a small number of + * known layouts and the set of attributes is assumed to typically be quite + * small. + * + * The registered attributes and layout "table" information is maintained + * in core and a special "sa_os_t" is attached to the objset_t. + * + * A special interface is provided to allow for quickly applying + * a large set of attributes at once. sa_replace_all_by_template() is + * used to set an array of attributes. This is used by the ZPL when + * creating a brand new file. The template that is passed into the function + * specifies the attribute, size for variable length attributes, location of + * data and special "data locator" function if the data isn't in a contiguous + * location. + * + * Byteswap implications: + * Since the SA attributes are not entirely self describing we can't do + * the normal byteswap processing. The special ZAP layout attribute and + * attribute registration attributes define the byteswap function and the + * size of the attributes, unless it is variable sized. + * The normal ZFS byteswapping infrastructure assumes you don't need + * to read any objects in order to do the necessary byteswapping. Whereas + * SA attributes can only be properly byteswapped if the dataset is opened + * and the layout/attribute ZAP attributes are available. Because of this + * the SA attributes will be byteswapped when they are first accessed by + * the SA code that will read the SA data. + */ + +typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, + uint16_t length, int length_idx, boolean_t, void *userp); + +static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); +static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); +static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, + void *data); +static void sa_idx_tab_rele(objset_t *os, void *arg); +static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, + int buflen); +static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, + sa_data_op_t action, sa_data_locator_t *locator, void *datastart, + uint16_t buflen, dmu_tx_t *tx); + +arc_byteswap_func_t *sa_bswap_table[] = { + byteswap_uint64_array, + byteswap_uint32_array, + byteswap_uint16_array, + byteswap_uint8_array, + zfs_acl_byteswap, +}; + +#define SA_COPY_DATA(f, s, t, l) \ + { \ + if (f == NULL) { \ + if (l == 8) { \ + *(uint64_t *)t = *(uint64_t *)s; \ + } else if (l == 16) { \ + *(uint64_t *)t = *(uint64_t *)s; \ + *(uint64_t *)((uintptr_t)t + 8) = \ + *(uint64_t *)((uintptr_t)s + 8); \ + } else { \ + bcopy(s, t, l); \ + } \ + } else \ + sa_copy_data(f, s, t, l); \ + } + +/* + * This table is fixed and cannot be changed. Its purpose is to + * allow the SA code to work with both old/new ZPL file systems. 
+ * It contains the list of legacy attributes. These attributes aren't + * stored in the "attribute" registry zap objects, since older ZPL file systems + * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will + * use this static table. + */ +sa_attr_reg_t sa_legacy_attrs[] = { + {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, + {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, + {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, + {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, + {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, + {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, + {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, + {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, + {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, + {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, + {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, + {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, + {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, + {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, + {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, + {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, +}; + +/* + * ZPL legacy layout + * This is only used for objects of type DMU_OT_ZNODE + */ +sa_attr_type_t sa_legacy_zpl_layout[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + +/* + * Special dummy layout used for buffers with no attributes. + */ + +sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; + +static int sa_legacy_attr_count = 16; +static kmem_cache_t *sa_cache = NULL; + +/*ARGSUSED*/ +static int +sa_cache_constructor(void *buf, void *unused, int kmflag) +{ + sa_handle_t *hdl = buf; + + hdl->sa_bonus_tab = NULL; + hdl->sa_spill_tab = NULL; + hdl->sa_os = NULL; + hdl->sa_userp = NULL; + hdl->sa_bonus = NULL; + hdl->sa_spill = NULL; + mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +/*ARGSUSED*/ +static void +sa_cache_destructor(void *buf, void *unused) +{ + sa_handle_t *hdl = buf; + mutex_destroy(&hdl->sa_lock); +} + +void +sa_cache_init(void) +{ + sa_cache = kmem_cache_create("sa_cache", + sizeof (sa_handle_t), 0, sa_cache_constructor, + sa_cache_destructor, NULL, NULL, NULL, 0); +} + +void +sa_cache_fini(void) +{ + if (sa_cache) + kmem_cache_destroy(sa_cache); +} + +static int +layout_num_compare(const void *arg1, const void *arg2) +{ + const sa_lot_t *node1 = arg1; + const sa_lot_t *node2 = arg2; + + if (node1->lot_num > node2->lot_num) + return (1); + else if (node1->lot_num < node2->lot_num) + return (-1); + return (0); +} + +static int +layout_hash_compare(const void *arg1, const void *arg2) +{ + const sa_lot_t *node1 = arg1; + const sa_lot_t *node2 = arg2; + + if (node1->lot_hash > node2->lot_hash) + return (1); + if (node1->lot_hash < node2->lot_hash) + return (-1); + if (node1->lot_instance > node2->lot_instance) + return (1); + if (node1->lot_instance < node2->lot_instance) + return (-1); + return (0); +} + +boolean_t +sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) +{ + int i; + + if (count != tbf->lot_attr_count) + return (1); + + for (i = 0; i != count; i++) { + if (attrs[i] != tbf->lot_attrs[i]) + return (1); + } + return (0); +} + +#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) + +static uint64_t +sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) +{ + int i; + uint64_t crc = -1ULL; + + for (i = 0; i != attr_count; i++) + crc ^= SA_ATTR_HASH(attrs[i]); + + return (crc); +} + +static boolean_t 
+sa_has_blkptr(sa_handle_t *hdl) +{ + int rc; + if (hdl->sa_spill == NULL) { + if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, + &hdl->sa_spill)) == 0) + VERIFY(0 == sa_build_index(hdl, SA_SPILL)); + } else { + rc = 0; + } + + return (rc == 0 ? B_TRUE : B_FALSE); +} + +/* + * Main attribute lookup/update function + * returns 0 for success or non zero for failures + * + * Operates on bulk array, first failure will abort further processing + */ +int +sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, + sa_data_op_t data_op, dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + int i; + int error = 0; + sa_buf_type_t buftypes; + + buftypes = 0; + + ASSERT(count > 0); + for (i = 0; i != count; i++) { + ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); + + bulk[i].sa_addr = NULL; + /* First check the bonus buffer */ + + if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( + hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { + SA_ATTR_INFO(sa, hdl->sa_bonus_tab, + SA_GET_HDR(hdl, SA_BONUS), + bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); + if (tx && !(buftypes & SA_BONUS)) { + dmu_buf_will_dirty(hdl->sa_bonus, tx); + buftypes |= SA_BONUS; + } + } + if (bulk[i].sa_addr == NULL && sa_has_blkptr(hdl)) { + if (TOC_ATTR_PRESENT( + hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { + SA_ATTR_INFO(sa, hdl->sa_spill_tab, + SA_GET_HDR(hdl, SA_SPILL), + bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); + if (tx && !(buftypes & SA_SPILL) && + bulk[i].sa_size == bulk[i].sa_length) { + dmu_buf_will_dirty(hdl->sa_spill, tx); + buftypes |= SA_SPILL; + } + } + } + switch (data_op) { + case SA_LOOKUP: + if (bulk[i].sa_addr == NULL) + return (ENOENT); + if (bulk[i].sa_data) { + SA_COPY_DATA(bulk[i].sa_data_func, + bulk[i].sa_addr, bulk[i].sa_data, + bulk[i].sa_size); + } + continue; + + case SA_UPDATE: + /* existing rewrite of attr */ + if (bulk[i].sa_addr && + bulk[i].sa_size == bulk[i].sa_length) { + SA_COPY_DATA(bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_addr, + bulk[i].sa_length); + continue; + } else if (bulk[i].sa_addr) { /* attr size change */ + error = sa_modify_attrs(hdl, bulk[i].sa_attr, + SA_REPLACE, bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_length, tx); + } else { /* adding new attribute */ + error = sa_modify_attrs(hdl, bulk[i].sa_attr, + SA_ADD, bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_length, tx); + } + if (error) + return (error); + break; + } + } + return (error); +} + +static sa_lot_t * +sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, + uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) +{ + sa_os_t *sa = os->os_sa; + sa_lot_t *tb, *findtb; + int i; + avl_index_t loc; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); + tb->lot_attr_count = attr_count; + tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, + KM_SLEEP); + bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); + tb->lot_num = lot_num; + tb->lot_hash = hash; + tb->lot_instance = 0; + + if (zapadd) { + char attr_name[8]; + + if (sa->sa_layout_attr_obj == 0) { + int error; + sa->sa_layout_attr_obj = zap_create(os, + DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx); + error = zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1, + &sa->sa_layout_attr_obj, tx); + ASSERT3U(error, ==, 0); + } + + (void) snprintf(attr_name, sizeof (attr_name), + "%d", (int)lot_num); + VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, + attr_name, 2, attr_count, attrs, tx)); + } + + list_create(&tb->lot_idx_tab, sizeof 
(sa_idx_tab_t), + offsetof(sa_idx_tab_t, sa_next)); + + for (i = 0; i != attr_count; i++) { + if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) + tb->lot_var_sizes++; + } + + avl_add(&sa->sa_layout_num_tree, tb); + + /* verify we don't have a hash collision */ + if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) { + for (; findtb && findtb->lot_hash == hash; + findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { + if (findtb->lot_instance != tb->lot_instance) + break; + tb->lot_instance++; + } + } + avl_add(&sa->sa_layout_hash_tree, tb); + return (tb); +} + +static void +sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, + int count, dmu_tx_t *tx, sa_lot_t **lot) +{ + sa_lot_t *tb, tbsearch; + avl_index_t loc; + sa_os_t *sa = os->os_sa; + boolean_t found = B_FALSE; + + mutex_enter(&sa->sa_lock); + tbsearch.lot_hash = hash; + tbsearch.lot_instance = 0; + tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); + if (tb) { + for (; tb && tb->lot_hash == hash; + tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { + if (sa_layout_equal(tb, attrs, count) == 0) { + found = B_TRUE; + break; + } + } + } + if (!found) { + tb = sa_add_layout_entry(os, attrs, count, + avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); + } + mutex_exit(&sa->sa_lock); + *lot = tb; +} + +static int +sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) +{ + int error; + uint32_t blocksize; + + if (size == 0) { + blocksize = SPA_MINBLOCKSIZE; + } else if (size > SPA_MAXBLOCKSIZE) { + ASSERT(0); + return (EFBIG); + } else { + blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t); + } + + error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); + ASSERT(error == 0); + return (error); +} + +static void +sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) +{ + if (func == NULL) { + bcopy(datastart, target, buflen); + } else { + boolean_t start; + int bytes; + void *dataptr; + void *saptr = target; + uint32_t length; + + start = B_TRUE; + bytes = 0; + while (bytes < buflen) { + func(&dataptr, &length, buflen, start, datastart); + bcopy(dataptr, saptr, length); + saptr = (void *)((caddr_t)saptr + length); + bytes += length; + start = B_FALSE; + } + } +} + +/* + * Determine several different sizes + * first the sa header size + * the number of bytes to be stored + * if spill would occur the index in the attribute array is returned + * + * the boolean will_spill will be set when spilling is necessary. It + * is only set when the buftype is SA_BONUS + */ +static int +sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, + dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total, + boolean_t *will_spill) +{ + int var_size = 0; + int i; + int full_space; + int hdrsize; + boolean_t done = B_FALSE; + + if (buftype == SA_BONUS && sa->sa_force_spill) { + *total = 0; + *index = 0; + *will_spill = B_TRUE; + return (0); + } + + *index = -1; + *total = 0; + + if (buftype == SA_BONUS) + *will_spill = B_FALSE; + + hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : + sizeof (sa_hdr_phys_t); + + full_space = (buftype == SA_BONUS) ? 
DN_MAX_BONUSLEN : db->db_size; + + for (i = 0; i != attr_count; i++) { + boolean_t is_var_sz; + + *total += attr_desc[i].sa_length; + if (done) + goto next; + + is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); + if (is_var_sz) { + var_size++; + } + + if (is_var_sz && var_size > 1) { + if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + + *total < full_space) { + hdrsize += sizeof (uint16_t); + } else { + done = B_TRUE; + *index = i; + if (buftype == SA_BONUS) + *will_spill = B_TRUE; + continue; + } + } + + /* + * find index of where spill *could* occur. + * Then continue to count of remainder attribute + * space. The sum is used later for sizing bonus + * and spill buffer. + */ + if (buftype == SA_BONUS && *index == -1 && + P2ROUNDUP(*total + hdrsize, 8) > + (full_space - sizeof (blkptr_t))) { + *index = i; + done = B_TRUE; + } + +next: + if (P2ROUNDUP(*total + hdrsize, 8) > full_space && + buftype == SA_BONUS) + *will_spill = B_TRUE; + } + + hdrsize = P2ROUNDUP(hdrsize, 8); + return (hdrsize); +} + +#define BUF_SPACE_NEEDED(total, header) (total + header) + +/* + * Find layout that corresponds to ordering of attributes + * If not found a new layout number is created and added to + * persistent layout tables. + */ +static int +sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, + dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + uint64_t hash; + sa_buf_type_t buftype; + sa_hdr_phys_t *sahdr; + void *data_start; + int buf_space; + sa_attr_type_t *attrs, *attrs_start; + int i, lot_count; + int hdrsize, spillhdrsize; + int used; + dmu_object_type_t bonustype; + sa_lot_t *lot; + int len_idx; + int spill_used; + boolean_t spilling; + + dmu_buf_will_dirty(hdl->sa_bonus, tx); + bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); + + /* first determine bonus header size and sum of all attributes */ + hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, + SA_BONUS, &i, &used, &spilling); + + if (used > SPA_MAXBLOCKSIZE) + return (EFBIG); + + VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ? + MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) : + used + hdrsize, tx)); + + ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || + bonustype == DMU_OT_SA); + + /* setup and size spill buffer when needed */ + if (spilling) { + boolean_t dummy; + + if (hdl->sa_spill == NULL) { + int error; + error = dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL, + &hdl->sa_spill); + ASSERT3U(error, ==, 0); + } + dmu_buf_will_dirty(hdl->sa_spill, tx); + + spillhdrsize = sa_find_sizes(sa, &attr_desc[i], + attr_count - i, hdl->sa_spill, SA_SPILL, &i, + &spill_used, &dummy); + + if (spill_used > SPA_MAXBLOCKSIZE) + return (EFBIG); + + buf_space = hdl->sa_spill->db_size - spillhdrsize; + if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > + hdl->sa_spill->db_size) + VERIFY(0 == sa_resize_spill(hdl, + BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); + } + + /* setup starting pointers to lay down data */ + data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); + sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; + buftype = SA_BONUS; + + if (spilling) + buf_space = (sa->sa_force_spill) ? 
+ 0 : SA_BLKPTR_SPACE - hdrsize; + else + buf_space = hdl->sa_bonus->db_size - hdrsize; + + attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, + KM_SLEEP); + lot_count = 0; + + for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { + uint16_t length; + + attrs[i] = attr_desc[i].sa_attr; + length = SA_REGISTERED_LEN(sa, attrs[i]); + if (length == 0) + length = attr_desc[i].sa_length; + + if (buf_space < length) { /* switch to spill buffer */ + ASSERT(bonustype != DMU_OT_ZNODE); + if (buftype == SA_BONUS && !sa->sa_force_spill) { + sa_find_layout(hdl->sa_os, hash, attrs_start, + lot_count, tx, &lot); + SA_SET_HDR(sahdr, lot->lot_num, hdrsize); + } + + buftype = SA_SPILL; + hash = -1ULL; + len_idx = 0; + + sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; + sahdr->sa_magic = SA_MAGIC; + data_start = (void *)((uintptr_t)sahdr + + spillhdrsize); + attrs_start = &attrs[i]; + buf_space = hdl->sa_spill->db_size - spillhdrsize; + lot_count = 0; + } + hash ^= SA_ATTR_HASH(attrs[i]); + attr_desc[i].sa_addr = data_start; + attr_desc[i].sa_size = length; + SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, + data_start, length); + if (sa->sa_attr_table[attrs[i]].sa_length == 0) { + sahdr->sa_lengths[len_idx++] = length; + } + data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + + length), 8); + buf_space -= P2ROUNDUP(length, 8); + lot_count++; + } + + sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); + if (bonustype == DMU_OT_SA) { + SA_SET_HDR(sahdr, lot->lot_num, + buftype == SA_BONUS ? hdrsize : spillhdrsize); + } + + kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); + if (hdl->sa_bonus_tab) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); + hdl->sa_bonus_tab = NULL; + } + if (!sa->sa_force_spill) + VERIFY(0 == sa_build_index(hdl, SA_BONUS)); + if (hdl->sa_spill) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); + if (!spilling) { + /* + * remove spill block that is no longer needed. + * set sa_spill_remove to prevent sa_attr_op + * from trying to retrieve spill block before its + * been removed. The flag will be cleared if/when + * the handle is destroyed recreated or + * sa_build_layouts() needs to spill again. 
+ */ + dmu_buf_rele(hdl->sa_spill, NULL); + hdl->sa_spill = NULL; + hdl->sa_spill_tab = NULL; + VERIFY(0 == dmu_rm_spill(hdl->sa_os, + sa_handle_object(hdl), tx)); + } else { + VERIFY(0 == sa_build_index(hdl, SA_SPILL)); + } + } + + return (0); +} + +static void +sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) +{ + sa_os_t *sa = os->os_sa; + uint64_t sa_attr_count = 0; + int error = 0; + uint64_t attr_value; + sa_attr_table_t *tb; + zap_cursor_t zc; + zap_attribute_t za; + int registered_count = 0; + int i; + dmu_objset_type_t ostype = dmu_objset_type(os); + + sa->sa_user_table = + kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); + sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); + + if (sa->sa_reg_attr_obj != 0) + VERIFY(zap_count(os, sa->sa_reg_attr_obj, &sa_attr_count) == 0); + + if (ostype == DMU_OST_ZFS && sa_attr_count == 0) + sa_attr_count += sa_legacy_attr_count; + + /* Allocate attribute numbers for attributes that aren't registered */ + for (i = 0; i != count; i++) { + boolean_t found = B_FALSE; + int j; + + if (ostype == DMU_OST_ZFS) { + for (j = 0; j != sa_legacy_attr_count; j++) { + if (strcmp(reg_attrs[i].sa_name, + sa_legacy_attrs[j].sa_name) == 0) { + sa->sa_user_table[i] = + sa_legacy_attrs[j].sa_attr; + found = B_TRUE; + } + } + } + if (found) + continue; + + if (sa->sa_reg_attr_obj) + error = zap_lookup(os, sa->sa_reg_attr_obj, + reg_attrs[i].sa_name, 8, 1, &attr_value); + else + error = ENOENT; + switch (error) { + default: + case ENOENT: + sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; + sa_attr_count++; + break; + case 0: + sa->sa_user_table[i] = ATTR_NUM(attr_value); + break; + } + } + + os->os_sa->sa_num_attrs = sa_attr_count; + tb = os->os_sa->sa_attr_table = + kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); + + /* + * Attribute table is constructed from requested attribute list, + * previously foreign registered attributes, and also the legacy + * ZPL set of attributes. 
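/*
 * Illustrative sketch only: how a consumer of this interface might
 * register its own attributes at mount time via sa_setup() (defined
 * below).  The attribute names, the enum used to index the returned
 * table, and the header name are hypothetical; a length of 0 marks a
 * variable-sized attribute, as described earlier in this file.
 */
#include <sys/sa.h>

typedef enum { MY_ATTR_FOO = 0, MY_ATTR_BAR, MY_ATTR_COUNT } my_attr_t;

static sa_attr_reg_t my_attrs[MY_ATTR_COUNT] = {
	{ "MYFS_FOO", sizeof (uint64_t), SA_UINT64_ARRAY, 0 },
	{ "MYFS_BAR", 0, SA_UINT8_ARRAY, 0 },
};

static sa_attr_type_t *my_attr_table;

static int
my_sa_setup(objset_t *os, uint64_t sa_master_obj)
{
	my_attr_table = sa_setup(os, sa_master_obj, my_attrs, MY_ATTR_COUNT);
	return (my_attr_table == NULL ? -1 : 0);
}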
+ */ + + if (sa->sa_reg_attr_obj) { + for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t value; + value = za.za_first_integer; + + registered_count++; + tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); + tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); + tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); + tb[ATTR_NUM(value)].sa_registered = B_TRUE; + + if (tb[ATTR_NUM(value)].sa_name) { + continue; + } + tb[ATTR_NUM(value)].sa_name = + kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP); + (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, + strlen(za.za_name) +1); + } + zap_cursor_fini(&zc); + } + + if (ostype == DMU_OST_ZFS) { + for (i = 0; i != sa_legacy_attr_count; i++) { + if (tb[i].sa_name) + continue; + tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; + tb[i].sa_length = sa_legacy_attrs[i].sa_length; + tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; + tb[i].sa_registered = B_FALSE; + tb[i].sa_name = + kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, + KM_SLEEP); + (void) strlcpy(tb[i].sa_name, + sa_legacy_attrs[i].sa_name, + strlen(sa_legacy_attrs[i].sa_name) + 1); + } + } + + for (i = 0; i != count; i++) { + sa_attr_type_t attr_id; + + attr_id = sa->sa_user_table[i]; + if (tb[attr_id].sa_name) + continue; + + tb[attr_id].sa_length = reg_attrs[i].sa_length; + tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; + tb[attr_id].sa_attr = attr_id; + tb[attr_id].sa_name = + kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP); + (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, + strlen(reg_attrs[i].sa_name) + 1); + } + + os->os_sa->sa_need_attr_registration = + (sa_attr_count != registered_count); +} + +sa_attr_type_t * +sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count) +{ + zap_cursor_t zc; + zap_attribute_t za; + sa_os_t *sa; + dmu_objset_type_t ostype = dmu_objset_type(os); + sa_attr_type_t *tb; + + mutex_enter(&os->os_lock); + if (os->os_sa) { + mutex_enter(&os->os_sa->sa_lock); + mutex_exit(&os->os_lock); + tb = os->os_sa->sa_user_table; + mutex_exit(&os->os_sa->sa_lock); + return (tb); + } + + sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); + mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL); + sa->sa_master_obj = sa_obj; + + mutex_enter(&sa->sa_lock); + mutex_exit(&os->os_lock); + avl_create(&sa->sa_layout_num_tree, layout_num_compare, + sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); + avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, + sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); + + if (sa_obj) { + int error; + error = zap_lookup(os, sa_obj, SA_LAYOUTS, + 8, 1, &sa->sa_layout_attr_obj); + if (error != 0 && error != ENOENT) { + return (NULL); + } + error = zap_lookup(os, sa_obj, SA_REGISTRY, + 8, 1, &sa->sa_reg_attr_obj); + if (error != 0 && error != ENOENT) { + mutex_exit(&sa->sa_lock); + return (NULL); + } + } + + os->os_sa = sa; + sa_attr_table_setup(os, reg_attrs, count); + + if (sa->sa_layout_attr_obj != 0) { + for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + sa_attr_type_t *lot_attrs; + uint64_t lot_num; + + lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * + za.za_num_integers, KM_SLEEP); + + VERIFY(zap_lookup(os, sa->sa_layout_attr_obj, + za.za_name, 2, za.za_num_integers, lot_attrs) == 0); + VERIFY(ddi_strtoull(za.za_name, NULL, 10, + (unsigned long long *)&lot_num) == 0); + + (void) sa_add_layout_entry(os, lot_attrs, + za.za_num_integers, lot_num, + 
sa_layout_info_hash(lot_attrs, + za.za_num_integers), B_FALSE, NULL); + kmem_free(lot_attrs, sizeof (sa_attr_type_t) * + za.za_num_integers); + } + zap_cursor_fini(&zc); + } + + /* Add special layout number for old ZNODES */ + if (ostype == DMU_OST_ZFS) { + (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, + sa_legacy_attr_count, 0, + sa_layout_info_hash(sa_legacy_zpl_layout, + sa_legacy_attr_count), B_FALSE, NULL); + + (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, + 0, B_FALSE, NULL); + } + mutex_exit(&sa->sa_lock); + return (os->os_sa->sa_user_table); +} + +void +sa_tear_down(objset_t *os) +{ + sa_os_t *sa = os->os_sa; + sa_lot_t *layout; + void *cookie; + int i; + + kmem_free(sa->sa_user_table, sa->sa_user_table_sz); + + /* Free up attr table */ + + for (i = 0; i != sa->sa_num_attrs; i++) { + if (sa->sa_attr_table[i].sa_name) + kmem_free(sa->sa_attr_table[i].sa_name, + strlen(sa->sa_attr_table[i].sa_name) + 1); + } + + kmem_free(sa->sa_attr_table, + sizeof (sa_attr_table_t) * sa->sa_num_attrs); + + cookie = NULL; + while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) { + sa_idx_tab_t *tab; + while (tab = list_head(&layout->lot_idx_tab)) { + ASSERT(refcount_count(&tab->sa_refcount)); + sa_idx_tab_rele(os, tab); + } + } + + cookie = NULL; + while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) { + kmem_free(layout->lot_attrs, + sizeof (sa_attr_type_t) * layout->lot_attr_count); + kmem_free(layout, sizeof (sa_lot_t)); + } + + avl_destroy(&sa->sa_layout_hash_tree); + avl_destroy(&sa->sa_layout_num_tree); + + kmem_free(sa, sizeof (sa_os_t)); + os->os_sa = NULL; +} + +void +sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, + uint16_t length, int length_idx, boolean_t var_length, void *userp) +{ + sa_idx_tab_t *idx_tab = userp; + + if (var_length) { + ASSERT(idx_tab->sa_variable_lengths); + idx_tab->sa_variable_lengths[length_idx] = length; + } + TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, + (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); +} + +static void +sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, + sa_iterfunc_t func, sa_lot_t *tab, void *userp) +{ + void *data_start; + sa_lot_t *tb = tab; + sa_lot_t search; + avl_index_t loc; + sa_os_t *sa = os->os_sa; + int i; + uint16_t *length_start; + uint8_t length_idx = 0; + + if (tab == NULL) { + search.lot_num = SA_LAYOUT_NUM(hdr, type); + tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); + ASSERT(tb); + } + + if (IS_SA_BONUSTYPE(type)) { + data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + + offsetof(sa_hdr_phys_t, sa_lengths) + + (sizeof (uint16_t) * tb->lot_var_sizes)), 8); + length_start = hdr->sa_lengths; + } else { + data_start = hdr; + } + + for (i = 0; i != tb->lot_attr_count; i++) { + int attr_length, reg_length; + uint8_t idx_len; + + reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; + if (reg_length) { + attr_length = reg_length; + idx_len = 0; + } else { + attr_length = length_start[length_idx]; + idx_len = length_idx++; + } + + func(hdr, data_start, tb->lot_attrs[i], attr_length, + idx_len, reg_length == 0 ? 
B_TRUE : B_FALSE, userp); + + data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + + attr_length), 8); + } +} + +/*ARGSUSED*/ +void +sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, + uint16_t length, int length_idx, boolean_t variable_length, void *userp) +{ + sa_handle_t *hdl = userp; + sa_os_t *sa = hdl->sa_os->os_sa; + + sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); +} + +void +sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) +{ + sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); + dmu_buf_impl_t *db; + sa_os_t *sa = hdl->sa_os->os_sa; + int num_lengths = 1; + int i; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + if (sa_hdr_phys->sa_magic == SA_MAGIC) + return; + + db = SA_GET_DB(hdl, buftype); + + if (buftype == SA_SPILL) { + arc_release(db->db_buf, NULL); + arc_buf_thaw(db->db_buf); + } + + sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic); + sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); + + /* + * Determine number of variable lenghts in header + * The standard 8 byte header has one for free and a + * 16 byte header would have 4 + 1; + */ + if (SA_HDR_SIZE(sa_hdr_phys) > 8) + num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1; + for (i = 0; i != num_lengths; i++) + sa_hdr_phys->sa_lengths[i] = + BSWAP_16(sa_hdr_phys->sa_lengths[i]); + + sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA, + sa_byteswap_cb, NULL, hdl); + + if (buftype == SA_SPILL) + arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf); +} + +static int +sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) +{ + sa_hdr_phys_t *sa_hdr_phys; + dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype); + dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db); + sa_os_t *sa = hdl->sa_os->os_sa; + sa_idx_tab_t *idx_tab; + + sa_hdr_phys = SA_GET_HDR(hdl, buftype); + + mutex_enter(&sa->sa_lock); + + /* Do we need to byteswap? 
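sa_byteswap() above derives the number of sa_lengths[] entries from the header size: the basic 8-byte header carries one slot, and every additional 8 bytes of header carries four more 16-bit slots. A standalone check of that arithmetic, not part of the changeset:

#include <stdio.h>

int
main(void)
{
	int hdrsize;

	for (hdrsize = 8; hdrsize <= 32; hdrsize += 8) {
		int num_lengths = 1;

		if (hdrsize > 8)
			num_lengths += (hdrsize - 8) >> 1;
		printf("header %2d bytes -> %d length slots\n",
		    hdrsize, num_lengths);
	}
	return (0);
}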
*/ + + /* only check if not old znode */ + if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && + sa_hdr_phys->sa_magic != 0) { + VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC); + sa_byteswap(hdl, buftype); + } + + idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); + + if (buftype == SA_BONUS) + hdl->sa_bonus_tab = idx_tab; + else + hdl->sa_spill_tab = idx_tab; + + mutex_exit(&sa->sa_lock); + return (0); +} + +/*ARGSUSED*/ +void +sa_evict(dmu_buf_t *db, void *sap) +{ + panic("evicting sa dbuf %p\n", (void *)db); +} + +static void +sa_idx_tab_rele(objset_t *os, void *arg) +{ + sa_os_t *sa = os->os_sa; + sa_idx_tab_t *idx_tab = arg; + + if (idx_tab == NULL) + return; + + mutex_enter(&sa->sa_lock); + if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { + list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); + if (idx_tab->sa_variable_lengths) + kmem_free(idx_tab->sa_variable_lengths, + sizeof (uint16_t) * + idx_tab->sa_layout->lot_var_sizes); + refcount_destroy(&idx_tab->sa_refcount); + kmem_free(idx_tab->sa_idx_tab, + sizeof (uint32_t) * sa->sa_num_attrs); + kmem_free(idx_tab, sizeof (sa_idx_tab_t)); + } + mutex_exit(&sa->sa_lock); +} + +static void +sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) +{ + sa_os_t *sa = os->os_sa; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + (void) refcount_add(&idx_tab->sa_refcount, NULL); +} + +void +sa_handle_destroy(sa_handle_t *hdl) +{ + mutex_enter(&hdl->sa_lock); + (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl, + NULL, NULL, NULL); + + if (hdl->sa_bonus_tab) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); + hdl->sa_bonus_tab = NULL; + } + if (hdl->sa_spill_tab) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); + hdl->sa_spill_tab = NULL; + } + + dmu_buf_rele(hdl->sa_bonus, NULL); + + if (hdl->sa_spill) + dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); + mutex_exit(&hdl->sa_lock); + + kmem_cache_free(sa_cache, hdl); +} + +int +sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, + sa_handle_type_t hdl_type, sa_handle_t **handlepp) +{ + int error = 0; + dmu_object_info_t doi; + sa_handle_t *handle; + +#ifdef ZFS_DEBUG + dmu_object_info_from_db(db, &doi); + ASSERT(doi.doi_bonus_type == DMU_OT_SA || + doi.doi_bonus_type == DMU_OT_ZNODE); +#endif + /* find handle, if it exists */ + /* if one doesn't exist then create a new one, and initialize it */ + + handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL; + if (handle == NULL) { + sa_handle_t *newhandle; + handle = kmem_cache_alloc(sa_cache, KM_SLEEP); + handle->sa_userp = userp; + handle->sa_bonus = db; + handle->sa_os = os; + handle->sa_spill = NULL; + + error = sa_build_index(handle, SA_BONUS); + newhandle = (hdl_type == SA_HDL_SHARED) ? 
+ dmu_buf_set_user_ie(db, handle, + NULL, sa_evict) : NULL; + + if (newhandle != NULL) { + kmem_cache_free(sa_cache, handle); + handle = newhandle; + } + } + *handlepp = handle; + + return (error); +} + +int +sa_handle_get(objset_t *objset, uint64_t objid, void *userp, + sa_handle_type_t hdl_type, sa_handle_t **handlepp) +{ + dmu_buf_t *db; + int error; + + if (error = dmu_bonus_hold(objset, objid, NULL, &db)) + return (error); + + return (sa_handle_get_from_db(objset, db, userp, hdl_type, + handlepp)); +} + +int +sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) +{ + return (dmu_bonus_hold(objset, obj_num, tag, db)); +} + +void +sa_buf_rele(dmu_buf_t *db, void *tag) +{ + dmu_buf_rele(db, tag); +} + +int +sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) +{ + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL)); +} + +int +sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_attr = attr; + bulk.sa_data = buf; + bulk.sa_length = buflen; + bulk.sa_data_func = NULL; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + error = sa_lookup_impl(hdl, &bulk, 1); + mutex_exit(&hdl->sa_lock); + return (error); +} + +#ifdef _KERNEL +int +sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_data = NULL; + bulk.sa_attr = attr; + bulk.sa_data_func = NULL; + + ASSERT(hdl); + + mutex_enter(&hdl->sa_lock); + if (sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL) == 0) { + error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, + uio->uio_resid), UIO_READ, uio); + } else { + error = ENOENT; + } + mutex_exit(&hdl->sa_lock); + return (error); + +} +#endif + +/* + * Find an already existing TOC from given os and data + * This is a special interface to be used by the ZPL for + * finding the uid/gid/gen attributes. + */ +void * +sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data) +{ + sa_idx_tab_t *idx_tab; + sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data; + sa_os_t *sa = os->os_sa; + sa_lot_t *tb, search; + avl_index_t loc; + + /* + * Deterimine layout number. If SA node and header == 0 then + * force the index table to the dummy "1" empty layout. + * + * The layout number would only be zero for a newly created file + * that has not added any attributes yet, or with crypto enabled which + * doesn't write any attributes to the bonus buffer. + */ + + search.lot_num = SA_LAYOUT_NUM(hdr, bonustype); + + tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); + + /* Verify header size is consistent with layout information */ + ASSERT(tb); + ASSERT(IS_SA_BONUSTYPE(bonustype) && + SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) || + (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0)); + + /* + * See if any of the already existing TOC entries can be reused? 
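sa_handle_get() and sa_lookup() above form the basic read path. A hedged consumer-side sketch, not from the changeset; "my_attr" stands for a number previously obtained from sa_setup():

static int
read_one_attr(objset_t *os, uint64_t obj, sa_attr_type_t my_attr,
    uint64_t *valp)
{
	sa_handle_t *hdl;
	int error;

	error = sa_handle_get(os, obj, NULL, SA_HDL_PRIVATE, &hdl);
	if (error != 0)
		return (error);

	error = sa_lookup(hdl, my_attr, valp, sizeof (*valp));
	sa_handle_destroy(hdl);
	return (error);
}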
+ */ + + for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab; + idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) { + boolean_t valid_idx = B_TRUE; + int i; + + if (tb->lot_var_sizes != 0 && + idx_tab->sa_variable_lengths != NULL) { + for (i = 0; i != tb->lot_var_sizes; i++) { + if (hdr->sa_lengths[i] != + idx_tab->sa_variable_lengths[i]) { + valid_idx = B_FALSE; + break; + } + } + } + if (valid_idx) { + sa_idx_tab_hold(os, idx_tab); + return (idx_tab); + } + } + + /* No such luck, create a new entry */ + idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP); + idx_tab->sa_idx_tab = + kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP); + idx_tab->sa_layout = tb; + refcount_create(&idx_tab->sa_refcount); + if (tb->lot_var_sizes) + idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) * + tb->lot_var_sizes, KM_SLEEP); + + sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab, + tb, idx_tab); + sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */ + sa_idx_tab_hold(os, idx_tab); /* one for layout */ + list_insert_tail(&tb->lot_idx_tab, idx_tab); + return (idx_tab); +} + +void +sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len, + boolean_t start, void *userdata) +{ + ASSERT(start); + + *dataptr = userdata; + *len = total_len; +} + +static void +sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) +{ + uint64_t attr_value = 0; + sa_os_t *sa = hdl->sa_os->os_sa; + sa_attr_table_t *tb = sa->sa_attr_table; + int i; + + mutex_enter(&sa->sa_lock); + + if (!sa->sa_need_attr_registration || sa->sa_master_obj == NULL) { + mutex_exit(&sa->sa_lock); + return; + } + + if (sa->sa_reg_attr_obj == NULL) { + int error; + sa->sa_reg_attr_obj = zap_create(hdl->sa_os, + DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx); + error = zap_add(hdl->sa_os, sa->sa_master_obj, + SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx); + ASSERT(error == 0); + } + for (i = 0; i != sa->sa_num_attrs; i++) { + if (sa->sa_attr_table[i].sa_registered) + continue; + ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, + tb[i].sa_byteswap); + VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj, + tb[i].sa_name, 8, 1, &attr_value, tx)); + tb[i].sa_registered = B_TRUE; + } + sa->sa_need_attr_registration = B_FALSE; + mutex_exit(&sa->sa_lock); +} + +/* + * Replace all attributes with attributes specified in template. + * If dnode had a spill buffer then those attributes will be + * also be replaced, possibly with just an empty spill block + * + * This interface is intended to only be used for bulk adding of + * attributes for a new file. It will also be used by the ZPL + * when converting and old formatted znode to native SA support. + */ +int +sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, + int attr_count, dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + + if (sa->sa_need_attr_registration) + sa_attr_register_sync(hdl, tx); + return (sa_build_layouts(hdl, attr_desc, attr_count, tx)); +} + +int +sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, + int attr_count, dmu_tx_t *tx) +{ + int error; + + mutex_enter(&hdl->sa_lock); + error = sa_replace_all_by_template_locked(hdl, attr_desc, + attr_count, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +/* + * add/remove/replace a single attribute and then rewrite the entire set + * of attributes. 
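sa_replace_all_by_template() above rewrites the full attribute set in one pass, which is how a brand-new object receives its initial layout. An illustrative sketch using the SA_ADD_BULK_ATTR() helper declared in sys/sa.h; the attribute numbers and values are hypothetical and the transaction is assumed to be already assigned:

static int
init_new_object(sa_handle_t *hdl, sa_attr_type_t attr_mode,
    sa_attr_type_t attr_size, dmu_tx_t *tx)
{
	sa_bulk_attr_t bulk[2];
	uint64_t mode = 0644, size = 0;
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, attr_mode, NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, attr_size, NULL, &size, 8);

	return (sa_replace_all_by_template(hdl, bulk, count, tx));
}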
+ */ +static int +sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, + sa_data_op_t action, sa_data_locator_t *locator, void *datastart, + uint16_t buflen, dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + sa_bulk_attr_t *attr_desc; + void *old_data[2]; + int bonus_attr_count = 0; + int bonus_data_size, spill_data_size; + int spill_attr_count = 0; + int error; + uint16_t length; + int i, j, k, length_idx; + sa_hdr_phys_t *hdr; + sa_idx_tab_t *idx_tab; + int attr_count; + int count; + + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + + /* First make of copy of the old data */ + + if (((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_bonuslen) { + bonus_data_size = hdl->sa_bonus->db_size; + old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); + bcopy(hdl->sa_bonus->db_data, old_data[0], + hdl->sa_bonus->db_size); + bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; + } else { + old_data[0] = NULL; + } + + /* Bring spill buffer online if it isn't currently */ + + if (sa_has_blkptr(hdl)) { + spill_data_size = hdl->sa_spill->db_size; + old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP); + bcopy(hdl->sa_spill->db_data, old_data[1], + hdl->sa_spill->db_size); + spill_attr_count = + hdl->sa_spill_tab->sa_layout->lot_attr_count; + } else { + old_data[1] = NULL; + } + + /* build descriptor of all attributes */ + + attr_count = bonus_attr_count + spill_attr_count; + if (action == SA_ADD) + attr_count++; + else if (action == SA_REMOVE) + attr_count--; + + attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP); + + /* + * loop through bonus and spill buffer if it exists, and + * build up new attr_descriptor to reset the attributes + */ + k = j = 0; + count = bonus_attr_count; + hdr = SA_GET_HDR(hdl, SA_BONUS); + idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); + for (; k != 2; k++) { + /* iterate over each attribute in layout */ + for (i = 0, length_idx = 0; i != count; i++) { + sa_attr_type_t attr; + + attr = idx_tab->sa_layout->lot_attrs[i]; + if (attr == newattr) { + if (action == SA_REMOVE) { + j++; + continue; + } + ASSERT(SA_REGISTERED_LEN(sa, attr) == 0); + ASSERT(action == SA_REPLACE); + SA_ADD_BULK_ATTR(attr_desc, j, attr, + locator, datastart, buflen); + } else { + length = SA_REGISTERED_LEN(sa, attr); + if (length == 0) { + length = hdr->sa_lengths[length_idx++]; + } + + SA_ADD_BULK_ATTR(attr_desc, j, attr, + NULL, (void *) + (TOC_OFF(idx_tab->sa_idx_tab[attr]) + + (uintptr_t)old_data[k]), length); + } + } + if (k == 0 && hdl->sa_spill) { + hdr = SA_GET_HDR(hdl, SA_SPILL); + idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); + count = spill_attr_count; + } else { + break; + } + } + if (action == SA_ADD) { + length = SA_REGISTERED_LEN(sa, newattr); + if (length == 0) { + length = buflen; + } + SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, + datastart, buflen); + } + + error = sa_build_layouts(hdl, attr_desc, attr_count, tx); + + if (old_data[0]) + kmem_free(old_data[0], bonus_data_size); + if (old_data[1]) + kmem_free(old_data[1], spill_data_size); + kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); + + return (error); +} + +static int +sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, + dmu_tx_t *tx) +{ + int error; + sa_os_t *sa = hdl->sa_os->os_sa; + dmu_object_type_t bonustype; + + bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); + + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + + /* sync out registration table if necessary */ + if (sa->sa_need_attr_registration) + sa_attr_register_sync(hdl, tx); + + error = sa_attr_op(hdl, 
bulk, count, SA_UPDATE, tx); + if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) + sa->sa_update_cb(hdl, tx); + + return (error); +} + +/* + * update or add new attribute + */ +int +sa_update(sa_handle_t *hdl, sa_attr_type_t type, + void *buf, uint32_t buflen, dmu_tx_t *tx) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_attr = type; + bulk.sa_data_func = NULL; + bulk.sa_length = buflen; + bulk.sa_data = buf; + + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, &bulk, 1, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int +sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr, + uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_attr = attr; + bulk.sa_data = userdata; + bulk.sa_data_func = locator; + bulk.sa_length = buflen; + + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, &bulk, 1, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +/* + * Return size of an attribute + */ + +int +sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) +{ + sa_bulk_attr_t bulk; + + bulk.sa_data = NULL; + bulk.sa_attr = attr; + bulk.sa_data_func = NULL; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + if (sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) { + mutex_exit(&hdl->sa_lock); + return (ENOENT); + } + *size = bulk.sa_size; + + mutex_exit(&hdl->sa_lock); + return (0); +} + +int +sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) +{ + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + return (sa_lookup_impl(hdl, attrs, count)); +} + +int +sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) +{ + int error; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + error = sa_bulk_lookup_locked(hdl, attrs, count); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int +sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) +{ + int error; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, attrs, count, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int +sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) +{ + int error; + + mutex_enter(&hdl->sa_lock); + error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, + NULL, 0, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +void +sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) +{ + dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi); +} + +void +sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) +{ + dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus, + blksize, nblocks); +} + +void +sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl) +{ + (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus, + oldhdl, newhdl, NULL, sa_evict); + oldhdl->sa_bonus = NULL; +} + +void +sa_set_userp(sa_handle_t *hdl, void *ptr) +{ + hdl->sa_userp = ptr; +} + +dmu_buf_t * +sa_get_db(sa_handle_t *hdl) +{ + return ((dmu_buf_t *)hdl->sa_bonus); +} + +void * +sa_get_userdata(sa_handle_t *hdl) +{ + return (hdl->sa_userp); +} + +void +sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) +{ + ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); + os->os_sa->sa_update_cb = func; +} + +void +sa_register_update_callback(objset_t *os, sa_update_cb_t *func) +{ + + mutex_enter(&os->os_sa->sa_lock); + sa_register_update_callback_locked(os, func); + mutex_exit(&os->os_sa->sa_lock); +} + +uint64_t +sa_handle_object(sa_handle_t *hdl) +{ + return (hdl->sa_bonus->db_object); +} + +boolean_t 
+sa_enabled(objset_t *os) +{ + return (os->os_sa == NULL); +} + +int +sa_set_sa_object(objset_t *os, uint64_t sa_object) +{ + sa_os_t *sa = os->os_sa; + + if (sa->sa_master_obj) + return (1); + + sa->sa_master_obj = sa_object; + + return (0); +} + +int +sa_hdrsize(void *arg) +{ + sa_hdr_phys_t *hdr = arg; + + return (SA_HDR_SIZE(hdr)); +} + +void +sa_handle_lock(sa_handle_t *hdl) +{ + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); +} + +void +sa_handle_unlock(sa_handle_t *hdl) +{ + ASSERT(hdl); + mutex_exit(&hdl->sa_lock); +} diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sys/dbuf.h --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Tue Mar 16 09:43:38 2010 -0600 @@ -38,7 +38,6 @@ extern "C" { #endif -#define DB_BONUS_BLKID (-1ULL) #define IN_DMU_SYNC 2 /* @@ -242,6 +241,10 @@ dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); void dbuf_create_bonus(struct dnode *dn); +int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx); +void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag); + +void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sys/dmu.h --- a/usr/src/uts/common/fs/zfs/sys/dmu.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Tue Mar 16 09:43:38 2010 -0600 @@ -63,6 +63,7 @@ struct nvlist; struct arc_buf; struct zio_prop; +struct sa_handle; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; @@ -122,6 +123,10 @@ DMU_OT_USERREFS, /* ZAP */ DMU_OT_DDT_ZAP, /* ZAP */ DMU_OT_DDT_STATS, /* ZAP */ + DMU_OT_SA, /* System attr */ + DMU_OT_SA_MASTER_NODE, /* ZAP */ + DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ + DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -159,6 +164,11 @@ #define DMU_DEADLIST_OBJECT (-3ULL) /* + * artificial blkids for bonus buffer and spill blocks + */ +#define DMU_BONUS_BLKID (-1ULL) +#define DMU_SPILL_BLKID (-2ULL) +/* * Public routines to create, destroy, open, and close objsets. 
*/ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); @@ -314,6 +324,7 @@ */ #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 +#define WP_SPILL 0x4 void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, struct zio_prop *zp); @@ -330,6 +341,17 @@ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); +int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); +int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); + +/* + * Special spill buffer support used by "SA" framework + */ + +int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); +int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags, + void *tag, dmu_buf_t **dbp); +int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the @@ -443,6 +465,9 @@ uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); +void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_wait(dmu_tx_t *tx); diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sys/dmu_objset.h --- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +33,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -99,6 +100,9 @@ /* stuff we store for the user */ kmutex_t os_user_ptr_lock; void *os_user_ptr; + + /* SA layout/attribute registration */ + sa_os_t *os_sa; }; #define DMU_META_OBJSET 0 @@ -146,7 +150,8 @@ int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, objset_t **osp); void dmu_objset_evict(objset_t *os); -void dmu_objset_do_userquota_callbacks(objset_t *os, dmu_tx_t *tx); +void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx); +void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before); boolean_t dmu_objset_userused_enabled(objset_t *os); int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sys/dmu_tx.h --- a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
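The new dmu_tx_hold_sa() above lets a caller reserve space for an SA change before dirtying it. A sketch of the usual transaction pattern; dmu_tx_create(), TXG_WAIT and dmu_tx_commit() are the standard DMU calls, the attribute number is hypothetical, and error handling is trimmed:

static int
bump_attr(objset_t *os, sa_handle_t *hdl, sa_attr_type_t attr,
    uint64_t newval)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int error;

	dmu_tx_hold_sa(tx, hdl, B_FALSE);	/* B_TRUE if it may grow */
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		return (error);
	}
	error = sa_update(hdl, attr, &newval, sizeof (newval), tx);
	dmu_tx_commit(tx);
	return (error);
}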
*/ @@ -77,6 +77,7 @@ THT_FREE, THT_ZAP, THT_SPACE, + THT_SPILL, THT_NUMTYPES }; diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sys/dnode.h --- a/usr/src/uts/common/fs/zfs/sys/dnode.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h Tue Mar 16 09:43:38 2010 -0600 @@ -63,6 +63,19 @@ #define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ /* + * dnode id flags + * + * Note: a file will never ever have its + * ids moved from bonus->spill + * and only in a crypto environment would it be on spill + */ +#define DN_ID_CHKED_BONUS 0x1 +#define DN_ID_CHKED_SPILL 0x2 +#define DN_ID_OLD_EXIST 0x4 +#define DN_ID_NEW_EXIST 0x8 +#define DN_ID_SYNC 0x10 + +/* * Derived constants. */ #define DNODE_SIZE (1 << DNODE_SHIFT) @@ -70,6 +83,7 @@ #define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) #define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) #define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1) +#define DN_KILL_SPILLBLK (1) #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) @@ -102,6 +116,9 @@ #define DNODE_FLAG_USED_BYTES (1<<0) #define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) +/* Does dnode have a SA spill blkptr in bonus? */ +#define DNODE_FLAG_SPILL_BLKPTR (1<<2) + typedef struct dnode_phys { uint8_t dn_type; /* dmu_object_type_t */ uint8_t dn_indblkshift; /* ln2(indirect block size) */ @@ -122,7 +139,8 @@ uint64_t dn_pad3[4]; blkptr_t dn_blkptr[1]; - uint8_t dn_bonus[DN_MAX_BONUSLEN]; + uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)]; + blkptr_t dn_spill; } dnode_phys_t; typedef struct dnode { @@ -162,6 +180,8 @@ uint8_t dn_next_nblkptr[TXG_SIZE]; uint8_t dn_next_nlevels[TXG_SIZE]; uint8_t dn_next_indblkshift[TXG_SIZE]; + uint8_t dn_next_bonustype[TXG_SIZE]; + uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */ uint16_t dn_next_bonuslen[TXG_SIZE]; uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ @@ -186,12 +206,17 @@ kmutex_t dn_dbufs_mtx; list_t dn_dbufs; /* linked list of descendent dbuf_t's */ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ + boolean_t dn_have_spill; /* have spill or are spilling */ /* parent IO for current sync write */ zio_t *dn_zio; /* used in syncing context */ - dnode_phys_t *dn_oldphys; + uint64_t dn_oldused; /* old phys used bytes */ + uint64_t dn_oldflags; /* old phys dn_flags */ + uint64_t dn_olduid, dn_oldgid; + uint64_t dn_newuid, dn_newgid; + int dn_id_flags; /* holds prefetch structure */ struct zfetch dn_zfetch; @@ -208,6 +233,9 @@ void dnode_special_close(dnode_t *dn); void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); +void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx); +void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx); + int dnode_hold(struct objset *dd, uint64_t object, void *ref, dnode_t **dnp); int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sys/sa.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/fs/zfs/sys/sa.h Tue Mar 16 09:43:38 2010 -0600 @@ -0,0 +1,171 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SA_H +#define _SYS_SA_H + +#include + +/* + * Currently available byteswap functions. + * If it all possible new attributes should used + * one of the already defined byteswap functions. + * If a new byteswap function is added then the + * ZPL/Pool version will need to be bumped. + */ + +typedef enum sa_bswap_type { + SA_UINT64_ARRAY, + SA_UINT32_ARRAY, + SA_UINT16_ARRAY, + SA_UINT8_ARRAY, + SA_ACL, +} sa_bswap_type_t; + +typedef uint16_t sa_attr_type_t; + +/* + * Attribute to register support for. + */ +typedef struct sa_attr_reg { + char *sa_name; /* attribute name */ + uint16_t sa_length; + sa_bswap_type_t sa_byteswap; /* bswap functon enum */ + sa_attr_type_t sa_attr; /* filled in during registration */ +} sa_attr_reg_t; + + +typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t, + boolean_t, void *userptr); + +/* + * array of attributes to store. + * + * This array should be treated as opaque/private data. + * The SA_BULK_ADD_ATTR() macro should be used for manipulating + * the array. + * + * When sa_replace_all_by_template() is used the attributes + * will be stored in the order defined in the array, except that + * the attributes may be split between the bonus and the spill buffer + * + */ +typedef struct sa_bulk_attr { + void *sa_data; + sa_data_locator_t *sa_data_func; + uint16_t sa_length; + sa_attr_type_t sa_attr; + /* the following are private to the sa framework */ + void *sa_addr; + uint16_t sa_buftype; + uint16_t sa_size; +} sa_bulk_attr_t; + + +/* + * special macro for adding entries for bulk attr support + * bulk - sa_bulk_attr_t + * count - integer that will be incremented during each add + * attr - attribute to manipulate + * func - function for accessing data. + * data - pointer to data. 
+ * len - length of data + */ + +#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \ +{ \ + b[idx].sa_attr = attr;\ + b[idx].sa_data_func = func; \ + b[idx].sa_data = data; \ + b[idx++].sa_length = len; \ +} + +typedef struct sa_os sa_os_t; + +typedef enum sa_handle_type { + SA_HDL_SHARED, + SA_HDL_PRIVATE +} sa_handle_type_t; + +struct sa_handle; +typedef void *sa_lookup_tab_t; +typedef struct sa_handle sa_handle_t; + +typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx); + +int sa_handle_get(objset_t *, uint64_t, void *userp, + sa_handle_type_t, sa_handle_t **); +int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp, + sa_handle_type_t, sa_handle_t **); +void sa_handle_destroy(sa_handle_t *); +int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **); +void sa_buf_rele(dmu_buf_t *, void *); +int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen); +int sa_update(sa_handle_t *, sa_attr_type_t, void *buf, + uint32_t buflen, dmu_tx_t *); +int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *); +int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count); +int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count); +int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *); +int sa_size(sa_handle_t *, sa_attr_type_t, int *); +int sa_update_from_cb(sa_handle_t *, sa_attr_type_t, + uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *); +void sa_object_info(sa_handle_t *, dmu_object_info_t *); +void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); +void sa_update_user(sa_handle_t *, sa_handle_t *); +void *sa_get_userdata(sa_handle_t *); +void sa_set_userp(sa_handle_t *, void *); +dmu_buf_t *sa_get_db(sa_handle_t *); +uint64_t sa_handle_object(sa_handle_t *); +boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size); +void sa_register_update_callback(objset_t *, sa_update_cb_t *); +sa_attr_type_t *sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int); +void sa_tear_down(objset_t *); +int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *, + int, dmu_tx_t *); +int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *, + int, dmu_tx_t *); +boolean_t sa_enabled(objset_t *); +void sa_cache_init(); +void sa_cache_fini(); +int sa_set_sa_object(objset_t *, uint64_t); +int sa_hdrsize(void *); +void sa_handle_lock(sa_handle_t *); +void sa_handle_unlock(sa_handle_t *); + +#ifdef _KERNEL +int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *); +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SA_H */ diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sys/sa_impl.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/fs/zfs/sys/sa_impl.h Tue Mar 16 09:43:38 2010 -0600 @@ -0,0 +1,288 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
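sa_update_from_cb() pairs an update with a sa_data_locator_t callback so the caller can supply the attribute value in pieces instead of one flat buffer (sa_default_locator() earlier in sa.c simply hands back the whole buffer at once). A hypothetical chunked locator matching the typedef, sketched outside the changeset:

typedef struct chunk_cb {
	uint8_t		*cb_buf;	/* whole logical attribute value */
	uint32_t	cb_off;		/* bytes handed out so far */
} chunk_cb_t;

static void
chunk_locator(void **dataptr, uint32_t *len, uint32_t total_len,
    boolean_t start, void *userdata)
{
	chunk_cb_t *cb = userdata;
	uint32_t seg = 64;		/* arbitrary segment size */

	if (start)
		cb->cb_off = 0;
	if (total_len - cb->cb_off < seg)
		seg = total_len - cb->cb_off;
	*dataptr = cb->cb_buf + cb->cb_off;
	*len = seg;
	cb->cb_off += seg;
}

static int
update_chunked(sa_handle_t *hdl, sa_attr_type_t attr, uint8_t *buf,
    uint32_t total_len, dmu_tx_t *tx)
{
	chunk_cb_t cb = { buf, 0 };

	return (sa_update_from_cb(hdl, attr, total_len, chunk_locator,
	    &cb, tx));
}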
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SA_IMPL_H +#define _SYS_SA_IMPL_H + +#include +#include +#include + +/* + * Array of known attributes and their + * various characteristics. + */ +typedef struct sa_attr_table { + sa_attr_type_t sa_attr; + uint8_t sa_registered; + uint16_t sa_length; + sa_bswap_type_t sa_byteswap; + char *sa_name; +} sa_attr_table_t; + +/* + * Zap attribute format for attribute registration + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | unused | len | bswap | attr num | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Zap attribute format for layout information. + * + * layout information is stored as an array of attribute numbers + * The name of the attribute is the layout number (0, 1, 2, ...) + * + * 16 0 + * +---- ---+ + * | attr # | + * +--------+ + * | attr # | + * +--- ----+ + * ...... + * + */ + +#define ATTR_BSWAP(x) BF32_GET(x, 16, 8) +#define ATTR_LENGTH(x) BF32_GET(x, 24, 16) +#define ATTR_NUM(x) BF32_GET(x, 0, 16) +#define ATTR_ENCODE(x, attr, length, bswap) \ +{ \ + BF64_SET(x, 24, 16, length); \ + BF64_SET(x, 16, 8, bswap); \ + BF64_SET(x, 0, 16, attr); \ +} + +#define TOC_OFF(x) BF32_GET(x, 0, 23) +#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1) +#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4) +#define TOC_ATTR_ENCODE(x, len_idx, offset) \ +{ \ + BF32_SET(x, 31, 1, 1); \ + BF32_SET(x, 24, 7, len_idx); \ + BF32_SET(x, 0, 24, offset); \ +} + +#define SA_LAYOUTS "LAYOUTS" +#define SA_REGISTRY "REGISTRY" + +/* + * Each unique layout will have their own table + * sa_lot (layout_table) + */ +typedef struct sa_lot { + avl_node_t lot_num_node; + avl_node_t lot_hash_node; + uint64_t lot_num; + uint64_t lot_hash; + sa_attr_type_t *lot_attrs; /* array of attr #'s */ + uint32_t lot_var_sizes; /* how many aren't fixed size */ + uint32_t lot_attr_count; /* total attr count */ + list_t lot_idx_tab; /* should be only a couple of entries */ + int lot_instance; /* used with lot_hash to identify entry */ +} sa_lot_t; + +/* index table of offsets */ +typedef struct sa_idx_tab { + list_node_t sa_next; + sa_lot_t *sa_layout; + uint16_t *sa_variable_lengths; + refcount_t sa_refcount; + uint32_t *sa_idx_tab; /* array of offsets */ +} sa_idx_tab_t; + +/* + * Since the offset/index information into the actual data + * will usually be identical we can share that information with + * all handles that have the exact same offsets. + * + * You would typically only have a large number of different table of + * contents if you had a several variable sized attributes. + * + * Two AVL trees are used to track the attribute layout numbers. + * one is keyed by number and will be consulted when a DMU_OT_SA + * object is first read. The second tree is keyed by the hash signature + * of the attributes and will be consulted when an attribute is added + * to determine if we already have an instance of that layout. Both + * of these tree's are interconnected. The only difference is that + * when an entry is found in the "hash" tree the list of attributes will + * need to be compared against the list of attributes you have in hand. 
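A registration ZAP entry packs three fields into one 64-bit value, as diagrammed in sa_impl.h: the attribute number in the low 16 bits, the byteswap selector in bits 16-23, and the fixed length in bits 24-39. A standalone round-trip of that encoding, mirroring the ATTR_NUM/ATTR_BSWAP/ATTR_LENGTH macros:

#include <stdio.h>
#include <stdint.h>

static uint64_t
attr_encode(uint16_t num, uint8_t bswap, uint16_t length)
{
	return (((uint64_t)length << 24) | ((uint64_t)bswap << 16) | num);
}

int
main(void)
{
	uint64_t v = attr_encode(21, 0 /* SA_UINT64_ARRAY */, 16);

	printf("encoded     = 0x%llx\n", (unsigned long long)v);
	printf("ATTR_NUM    = %llu\n", (unsigned long long)(v & 0xffff));
	printf("ATTR_BSWAP  = %llu\n", (unsigned long long)((v >> 16) & 0xff));
	printf("ATTR_LENGTH = %llu\n", (unsigned long long)((v >> 24) & 0xffff));
	return (0);
}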
+ * The assumption is that typically attributes will just be updated and + * adding a completely new attribute is a very rare operation. + */ +struct sa_os { + kmutex_t sa_lock; + boolean_t sa_need_attr_registration; + boolean_t sa_force_spill; + uint64_t sa_master_obj; + uint64_t sa_reg_attr_obj; + uint64_t sa_layout_attr_obj; + int sa_num_attrs; + sa_attr_table_t *sa_attr_table; /* private attr table */ + sa_update_cb_t *sa_update_cb; + avl_tree_t sa_layout_num_tree; /* keyed by layout number */ + avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */ + int sa_user_table_sz; + sa_attr_type_t *sa_user_table; /* user name->attr mapping table */ +}; + +/* + * header for all bonus and spill buffers. + * The header has a fixed portion with a variable number + * of "lengths" depending on the number of variable sized + * attribues which are determined by the "layout number" + */ + +#define SA_MAGIC 0x2F505A /* ZFS SA */ +typedef struct sa_hdr_phys { + uint32_t sa_magic; + uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */ + uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */ + /* ... Data follows the lengths. */ +} sa_hdr_phys_t; + +/* + * sa_hdr_phys -> sa_layout_info + * + * 16 10 0 + * +--------+-------+ + * | hdrsz |layout | + * +--------+-------+ + * + * Bits 0-10 are the layout number + * Bits 11-16 are the size of the header. + * The hdrsize is the number * 8 + * + * For example. + * hdrsz of 1 ==> 8 byte header + * 2 ==> 16 byte header + * + */ + +#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) +#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0) +#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ +{ \ + BF32_SET_SB(x, 10, 6, 3, 0, size); \ + BF32_SET(x, 0, 10, num); \ +} + +typedef enum sa_buf_type { + SA_BONUS = 1, + SA_SPILL = 2 +} sa_buf_type_t; + +typedef enum sa_data_op { + SA_LOOKUP, + SA_UPDATE, + SA_ADD, + SA_REPLACE, + SA_REMOVE +} sa_data_op_t; + +/* + * Opaque handle used for most sa functions + * + * This needs to be kept as small as possible. + */ + +struct sa_handle { + kmutex_t sa_lock; + dmu_buf_t *sa_bonus; + dmu_buf_t *sa_spill; + objset_t *sa_os; + void *sa_userp; + sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ + sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ +}; + +#define SA_GET_DB(hdl, type) \ + (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill) + +#define SA_GET_HDR(hdl, type) \ + ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \ + type))->db.db_data)) + +#define SA_IDX_TAB_GET(hdl, type) \ + (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab) + +#define IS_SA_BONUSTYPE(a) \ + ((a == DMU_OT_SA) ? B_TRUE : B_FALSE) + +#define SA_BONUSTYPE_FROM_DB(db) \ + (((dmu_buf_impl_t *)db)->db_dnode->dn_bonustype) + +#define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t)) + +#define SA_LAYOUT_NUM(x, type) \ + ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \ + ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 
1 : SA_HDR_LAYOUT_NUM(x)))) + + +#define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length + +#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\ + hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \ + SA_REGISTERED_LEN(sa, attr)) + +#define SA_SET_HDR(hdr, num, size) \ + { \ + hdr->sa_magic = SA_MAGIC; \ + SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \ + } + +#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \ + { \ + bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \ + bulk.sa_buftype = type; \ + bulk.sa_addr = \ + (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \ + (uintptr_t)hdr); \ +} + +#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \ + (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \ + (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \ + sizeof (uint16_t), 8) : 0))) + +int sa_add_impl(sa_handle_t *, sa_attr_type_t, + uint32_t, sa_data_locator_t, void *, dmu_tx_t *); + +void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *); +int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *); + +void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *); +int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t, + uint16_t *, sa_hdr_phys_t *); + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SA_IMPL_H */ diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sys/zfs_acl.h --- a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +33,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -106,12 +107,18 @@ #define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT) +/* + * Size of ACL count is always 2 bytes. + * Necessary to for dealing with both V0 ACL and V1 ACL layout + */ +#define ZFS_ACL_COUNT_SIZE (sizeof (uint16_t)) + typedef struct zfs_acl_phys { uint64_t z_acl_extern_obj; /* ext acl pieces */ uint32_t z_acl_size; /* Number of bytes in ACL */ uint16_t z_acl_version; /* acl version */ uint16_t z_acl_count; /* ace count */ - uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ + uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ } zfs_acl_phys_t; typedef struct acl_ops { @@ -146,21 +153,26 @@ void *z_allocdata; /* pointer to kmem allocated memory */ size_t z_allocsize; /* Size of blob in bytes */ size_t z_size; /* length of ACL data */ - int z_ace_count; /* number of ACEs in this acl node */ + uint64_t z_ace_count; /* number of ACEs in this acl node */ int z_ace_idx; /* ace iterator positioned on */ } zfs_acl_node_t; typedef struct zfs_acl { - int z_acl_count; /* Number of ACEs */ + uint64_t z_acl_count; /* Number of ACEs */ size_t z_acl_bytes; /* Number of bytes in ACL */ uint_t z_version; /* version of ACL */ void *z_next_ace; /* pointer to next ACE */ - int z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */ + uint64_t z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) 
*/ zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ list_t z_acl; /* chunks of ACE data */ acl_ops_t z_ops; /* ACL operations */ } zfs_acl_t; +typedef struct acl_locator_cb { + zfs_acl_t *cb_aclp; + zfs_acl_node_t *cb_acl_node; +} zfs_acl_locator_cb_t; + #define ACL_DATA_ALLOCED 0x1 #define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) @@ -174,6 +186,10 @@ struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */ } zfs_acl_ids_t; +#define ZFS_EXTERNAL_ACL(zp) \ + (zp->z_is_sa ? 0 : zfs_external_acl(zp)) +#define ZNODE_ACL_VERSION(zp) \ + (zp->z_is_sa ? ZFS_ACL_VERSION_FUID : zfs_znode_acl_version(zp)) /* * Property values for acl_mode and acl_inherit. * @@ -215,6 +231,14 @@ int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *, struct zfs_fuid_info **, zfs_acl_t **); int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *); +uint64_t zfs_external_acl(struct znode *); +int zfs_znode_acl_version(struct znode *); +int zfs_acl_size(struct znode *, int *); +zfs_acl_t *zfs_acl_alloc(int); +zfs_acl_node_t *zfs_acl_node_alloc(size_t); +void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *); +void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *); +uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *, uint64_t *); #endif diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sys/zfs_dir.h --- a/usr/src/uts/common/fs/zfs/sys/zfs_dir.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/zfs_dir.h Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -57,7 +57,7 @@ extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, pathname_t *); extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, - uint_t, znode_t **, int, zfs_acl_ids_t *); + uint_t, znode_t **, zfs_acl_ids_t *); extern void zfs_rmnode(znode_t *); extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); extern boolean_t zfs_dirempty(znode_t *); diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h --- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h Tue Mar 16 09:43:38 2010 -0600 @@ -71,12 +71,13 @@ #define DMU_BACKUP_FEATURE_DEDUP (0x1) #define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2) +#define DMU_BACKUP_FEATURE_SA_SPILL (0x4) /* * Mask of all supported backup features */ #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ - DMU_BACKUP_FEATURE_DEDUPPROPS) + DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL) /* Are all features in the given flag word currently supported? 
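The new DMU_BACKUP_FEATURE_SA_SPILL bit is folded into DMU_BACKUP_FEATURE_MASK so that receivers which do not understand spill records reject such streams. The check is a simple mask test, modelled standalone below with the same bit values:

#include <stdio.h>
#include <stdint.h>

#define	FEATURE_DEDUP		0x1
#define	FEATURE_DEDUPPROPS	0x2
#define	FEATURE_SA_SPILL	0x4
#define	FEATURE_MASK	\
	(FEATURE_DEDUP | FEATURE_DEDUPPROPS | FEATURE_SA_SPILL)
#define	STREAM_SUPPORTED(x)	(!((x) & ~(uint64_t)FEATURE_MASK))

int
main(void)
{
	printf("SA_SPILL only    -> %d\n", STREAM_SUPPORTED(FEATURE_SA_SPILL));
	printf("unknown bit 0x10 -> %d\n", STREAM_SUPPORTED(0x10));
	return (0);
}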
*/ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) @@ -118,7 +119,7 @@ enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, - DRR_NUMTYPES + DRR_SPILL, DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { @@ -188,6 +189,13 @@ uint8_t drr_pad2[6]; ddt_key_t drr_key; /* deduplication key */ } drr_write_byref; + struct drr_spill { + uint64_t drr_object; + uint64_t drr_length; + uint64_t drr_toguid; + uint64_t drr_pad[4]; /* needed for crypto */ + /* spill data follows */ + } drr_spill; } drr_u; } dmu_replay_record_t; diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sys/zfs_sa.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/fs/zfs/sys/zfs_sa.h Tue Mar 16 09:43:38 2010 -0600 @@ -0,0 +1,143 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_SA_H +#define _SYS_ZFS_SA_H + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include + + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This is the list of known attributes + * to the ZPL. The values of the actual + * attributes are not defined by the order + * the enums. It is controlled by the attribute + * registration mechanism. Two different file system + * could have different numeric values for the same + * attributes. this list is only used for dereferencing + * into the table that will hold the actual numeric value. + */ +typedef enum zpl_attr { + ZPL_ATIME, + ZPL_MTIME, + ZPL_CTIME, + ZPL_CRTIME, + ZPL_GEN, + ZPL_MODE, + ZPL_SIZE, + ZPL_PARENT, + ZPL_LINKS, + ZPL_XATTR, + ZPL_RDEV, + ZPL_FLAGS, + ZPL_UID, + ZPL_GID, + ZPL_PAD, + ZPL_ZNODE_ACL, + ZPL_DACL_COUNT, + ZPL_SYMLINK, + ZPL_SCANSTAMP, + ZPL_DACL_ACES, + ZPL_END +} zpl_attr_t; + +#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108 +#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \ + sizeof (zfs_acl_phys_t)) + +#define SA_MODE_OFFSET 0 +#define SA_SIZE_OFFSET 8 +#define SA_GEN_OFFSET 16 +#define SA_UID_OFFSET 24 +#define SA_GID_OFFSET 32 +#define SA_PARENT_OFFSET 40 + +extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1]; +extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1]; + +/* + * This is a deprecated data structure that only exists for + * dealing with file systems create prior to ZPL version 5. 
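ZFS_OLD_ZNODE_PHYS_SIZE above is 0x108 (264) bytes, the size of the legacy znode_phys_t defined just below; it lives in the 320-byte dnode bonus buffer, and once a dn_spill block pointer is carved out of the bonus area only 192 bytes remain for attribute data. A standalone recap of that arithmetic using the usual on-disk sizes (512-byte dnode, 64-byte fixed portion, 128-byte blkptr_t):

#include <stdio.h>

int
main(void)
{
	int dnode = 512, fixed = 64, blkptr = 128;
	int max_bonus = dnode - fixed - blkptr;		/* DN_MAX_BONUSLEN */

	printf("legacy znode_phys   : %d bytes\n", 0x108);
	printf("max bonus           : %d bytes\n", max_bonus);
	printf("trailing data area  : %d bytes\n", max_bonus - 0x108);
	printf("bonus with dn_spill : %d bytes\n", max_bonus - blkptr);
	return (0);
}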
+ */ +typedef struct znode_phys { + uint64_t zp_atime[2]; /* 0 - last file access time */ + uint64_t zp_mtime[2]; /* 16 - last file modification time */ + uint64_t zp_ctime[2]; /* 32 - last file change time */ + uint64_t zp_crtime[2]; /* 48 - creation time */ + uint64_t zp_gen; /* 64 - generation (txg of creation) */ + uint64_t zp_mode; /* 72 - file mode bits */ + uint64_t zp_size; /* 80 - size of file */ + uint64_t zp_parent; /* 88 - directory parent (`..') */ + uint64_t zp_links; /* 96 - number of links to file */ + uint64_t zp_xattr; /* 104 - DMU object for xattrs */ + uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ + uint64_t zp_flags; /* 120 - persistent flags */ + uint64_t zp_uid; /* 128 - file owner */ + uint64_t zp_gid; /* 136 - owning group */ + uint64_t zp_zap; /* 144 - extra attributes */ + uint64_t zp_pad[3]; /* 152 - future */ + zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ + /* + * Data may pad out any remaining bytes in the znode buffer, eg: + * + * |<---------------------- dnode_phys (512) ------------------------>| + * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| + * |<---- znode (264) ---->|<---- data (56) ---->| + * + * At present, we use this space for the following: + * - symbolic links + * - 32-byte anti-virus scanstamp (regular files only) + */ +} znode_phys_t; + +#ifdef _KERNEL +int zfs_sa_readlink(struct znode *, uio_t *); +void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *); +void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *); +void zfs_sa_get_scanstamp(struct znode *, xvattr_t *); +void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *); +void zfs_sa_uprade_pre(struct sa_handle *, void *, dmu_tx_t *); +void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *); +void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_SA_H */ diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h --- a/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -39,6 +40,7 @@ #endif typedef struct zfsvfs zfsvfs_t; +struct znode; struct zfsvfs { vfs_t *z_vfs; /* generic fs struct */ @@ -73,11 +75,13 @@ boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ + boolean_t z_use_sa; /* version allow system attributes */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ kmutex_t z_lock; uint64_t z_userquota_obj; uint64_t z_groupquota_obj; + sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ }; @@ -140,8 +144,10 @@ uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t quota); -extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs, - boolean_t isgroup, uint64_t fuid); +extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *, + boolean_t isgroup); +extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, + uint64_t fuid); extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp); extern void zfsvfs_free(zfsvfs_t *zfsvfs); diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/sys/zfs_znode.h --- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -32,8 +32,10 @@ #include #include #include +#include #include #include +#include #endif #include #include @@ -59,12 +61,14 @@ #define ZFS_AV_MODIFIED 0x0000040000000000 #define ZFS_REPARSE 0x0000080000000000 -#define ZFS_ATTR_SET(zp, attr, value) \ +#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \ { \ if (value) \ - zp->z_phys->zp_flags |= attr; \ + pflags |= attr; \ else \ - zp->z_phys->zp_flags &= ~attr; \ + pflags &= ~attr; \ + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \ + &pflags, sizeof (pflags), tx)); \ } /* @@ -80,6 +84,27 @@ #define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ #define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ +#define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME] +#define SA_ZPL_MTIME(z) z->z_attr_table[ZPL_MTIME] +#define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME] +#define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME] +#define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN] +#define SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES] +#define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR] +#define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK] +#define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV] +#define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP] +#define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID] +#define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID] +#define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT] +#define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS] +#define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE] +#define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT] +#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS] +#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE] +#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL] +#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD] + /* * Is ID ephemeral? */ @@ -88,8 +113,10 @@ /* * Should we use FUIDs? */ -#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID &&\ +#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \ spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) +#define USE_SA(version, os) (version >= ZPL_VERSION_SA && \ + spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA) #define MASTER_NODE_OBJ 1 @@ -104,6 +131,7 @@ #define ZPL_VERSION_STR "VERSION" #define ZFS_FUID_TABLES "FUID" #define ZFS_SHARES_DIR "SHARES" +#define ZFS_SA_ATTRS "SA_ATTRS" #define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) @@ -132,42 +160,6 @@ #define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) /* - * This is the persistent portion of the znode. It is stored - * in the "bonus buffer" of the file. Short symbolic links - * are also stored in the bonus buffer. 
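
The SA_ZPL_*() macros defined above are how the ZPL now names persistent znode attributes: each one indexes the per-filesystem z_attr_table to recover the attribute id registered for this objset, and that id is what sa_lookup() and sa_update() take through the znode's z_sa_hdl handle. A minimal sketch of the pattern follows; the helper name is invented and the surrounding kernel headers are assumed, so treat it as an illustration of the calls above rather than code from the changeset.

/*
 * Illustration only: read the persistent flags through the registered
 * attribute id, set a bit, and write the cached copy back under the
 * caller's transaction (this mirrors the new ZFS_ATTR_SET macro).
 */
static int
zfs_set_pflag_sketch(znode_t *zp, uint64_t flag, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint64_t pflags;
	int error;

	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
	    &pflags, sizeof (pflags));
	if (error != 0)
		return (error);

	pflags |= flag;
	zp->z_pflags = pflags;

	return (sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
	    &zp->z_pflags, sizeof (zp->z_pflags), tx));
}
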
- */ -typedef struct znode_phys { - uint64_t zp_atime[2]; /* 0 - last file access time */ - uint64_t zp_mtime[2]; /* 16 - last file modification time */ - uint64_t zp_ctime[2]; /* 32 - last file change time */ - uint64_t zp_crtime[2]; /* 48 - creation time */ - uint64_t zp_gen; /* 64 - generation (txg of creation) */ - uint64_t zp_mode; /* 72 - file mode bits */ - uint64_t zp_size; /* 80 - size of file */ - uint64_t zp_parent; /* 88 - directory parent (`..') */ - uint64_t zp_links; /* 96 - number of links to file */ - uint64_t zp_xattr; /* 104 - DMU object for xattrs */ - uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ - uint64_t zp_flags; /* 120 - persistent flags */ - uint64_t zp_uid; /* 128 - file owner */ - uint64_t zp_gid; /* 136 - owning group */ - uint64_t zp_zap; /* 144 - extra attributes */ - uint64_t zp_pad[3]; /* 152 - future */ - zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ - /* - * Data may pad out any remaining bytes in the znode buffer, eg: - * - * |<---------------------- dnode_phys (512) ------------------------>| - * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| - * |<---- znode (264) ---->|<---- data (56) ---->| - * - * At present, we use this space for the following: - * - symbolic links - * - 32-byte anti-virus scanstamp (regular files only) - */ -} znode_phys_t; - -/* * Directory entry locks control access to directory entries. * They are used to protect creates, deletes, and renames. * Each directory znode has a mutex and a list of locked names. @@ -200,16 +192,20 @@ uint_t z_seq; /* modification sequence number */ uint64_t z_mapcnt; /* number of pages mapped to file */ uint64_t z_last_itx; /* last ZIL itx on this znode */ - uint64_t z_gen; /* generation (same as zp_gen) */ + uint64_t z_gen; /* generation (cached) */ + uint64_t z_size; /* file size (cached) */ + uint64_t z_atime[2]; /* atime (cached) */ + uint64_t z_links; /* file links (cached) */ + uint64_t z_pflags; /* pflags (cached) */ + uid_t z_uid; /* uid mapped (cached) */ + uid_t z_gid; /* gid mapped (cached) */ + mode_t z_mode; /* mode (cached) */ uint32_t z_sync_cnt; /* synchronous open count */ kmutex_t z_acl_lock; /* acl data lock */ zfs_acl_t *z_acl_cached; /* cached acl */ list_node_t z_link_node; /* all znodes in fs link */ - /* - * These are dmu managed fields. - */ - znode_phys_t *z_phys; /* pointer to persistent znode */ - dmu_buf_t *z_dbuf; /* buffer containing the z_phys */ + sa_handle_t *z_sa_hdl; /* handle to sa data */ + boolean_t z_is_sa; /* are we native sa? 
*/ } znode_t; @@ -252,7 +248,7 @@ #define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG) #define ZFS_VERIFY_ZP(zp) \ - if ((zp)->z_dbuf == NULL) { \ + if ((zp)->z_sa_hdl == NULL) { \ ZFS_EXIT((zp)->z_zfsvfs); \ return (EIO); \ } \ @@ -294,14 +290,14 @@ #define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ - zfs_time_stamper(zp, ACCESSED, NULL) + zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE); extern int zfs_init_fs(zfsvfs_t *, znode_t **); extern void zfs_set_dataprop(objset_t *); extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *, dmu_tx_t *tx); -extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *); -extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *); +extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2], + uint64_t [2], boolean_t); extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); extern void zfs_znode_init(void); @@ -340,7 +336,7 @@ znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); -extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap); +extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx); diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/zfs_acl.c --- a/usr/src/uts/common/fs/zfs/zfs_acl.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_acl.c Tue Mar 16 09:43:38 2010 -0600 @@ -50,6 +50,7 @@ #include #include #include +#include #include "fs/fs_subr.h" #include @@ -321,6 +322,82 @@ zfs_ace_fuid_data }; +/* + * The following three functions are provided for compatibility with + * older ZPL version in order to determine if the file use to have + * an external ACL and what version of ACL previously existed on the + * file. Would really be nice to not need this, sigh. + */ + +uint64_t +zfs_external_acl(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + + if (zp->z_is_sa) + return (0); + + VERIFY(0 == sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))); + + return (acl_phys.z_acl_extern_obj); +} + +/* + * Determine size of ACL in bytes + * + * This is more complicated than it should be since we have to deal + * with old external ACLs. 
+ */ +static int +zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, + zfs_acl_phys_t *aclphys) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t acl_count; + int size; + int error; + + if (zp->z_is_sa) { + if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), + &size)) != 0) + return (error); + *aclsize = size; + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), + &acl_count, sizeof (acl_count))) != 0) + return (error); + *aclcount = acl_count; + } else { + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + aclphys, sizeof (*aclphys))) != 0) + return (error); + + if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { + *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); + *aclcount = aclphys->z_acl_size; + } else { + *aclsize = aclphys->z_acl_size; + *aclcount = aclphys->z_acl_count; + } + } + return (0); +} + +int +zfs_znode_acl_version(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + + if (zp->z_is_sa) { + return (ZFS_ACL_VERSION_FUID); + } else { + VERIFY(0 == sa_lookup(zp->z_sa_hdl, + SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))); + return (acl_phys.z_acl_version); + } +} + static int zfs_acl_version(int version) { @@ -336,7 +413,7 @@ return (zfs_acl_version(zp->z_zfsvfs->z_version)); } -static zfs_acl_t * +zfs_acl_t * zfs_acl_alloc(int vers) { zfs_acl_t *aclp; @@ -352,7 +429,7 @@ return (aclp); } -static zfs_acl_node_t * +zfs_acl_node_t * zfs_acl_node_alloc(size_t bytes) { zfs_acl_node_t *aclnode; @@ -463,6 +540,8 @@ { zfs_acl_node_t *aclnode; + ASSERT(aclp); + if (start == NULL) { aclnode = list_head(&aclp->z_acl); if (aclnode == NULL) @@ -509,6 +588,7 @@ *who = aclp->z_ops.ace_who_get(acep); aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; aclnode->z_ace_idx++; + return ((void *)acep); } return (NULL); @@ -542,7 +622,7 @@ */ int zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, - void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size, + void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, zfs_fuid_info_t **fuidp, cred_t *cr) { int i; @@ -773,8 +853,8 @@ * Determine mode of file based on ACL. * Also, create FUIDs for any User/Group ACEs */ -static uint64_t -zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) +uint64_t +zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, uint64_t *pflags) { int entry_type; mode_t mode; @@ -785,7 +865,7 @@ uint32_t access_mask; boolean_t an_exec_denied = B_FALSE; - mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { @@ -930,48 +1010,13 @@ an_exec_denied = B_TRUE; if (an_exec_denied) - zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED; + *pflags &= ~ZFS_NO_EXECS_DENIED; else - zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED; + *pflags |= ZFS_NO_EXECS_DENIED; return (mode); } -static zfs_acl_t * -zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify) -{ - zfs_acl_t *aclp; - zfs_acl_node_t *aclnode; - - aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); - - /* - * Version 0 to 1 znode_acl_phys has the size/count fields swapped. - * Version 0 didn't have a size field, only a count. 
- */ - if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_size; - aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count); - } else { - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; - aclp->z_acl_bytes = zp->z_phys->zp_acl.z_acl_size; - } - - aclnode = zfs_acl_node_alloc(will_modify ? aclp->z_acl_bytes : 0); - aclnode->z_ace_count = aclp->z_acl_count; - if (will_modify) { - bcopy(zp->z_phys->zp_acl.z_ace_data, aclnode->z_acldata, - aclp->z_acl_bytes); - } else { - aclnode->z_size = aclp->z_acl_bytes; - aclnode->z_acldata = &zp->z_phys->zp_acl.z_ace_data[0]; - } - - list_insert_head(&aclp->z_acl, aclnode); - - return (aclp); -} - /* * Read an external acl object. If the intent is to modify, always * create a new acl and leave any cached acl in place. @@ -979,12 +1024,13 @@ static int zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) { - uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj; zfs_acl_t *aclp; - size_t aclsize; - size_t acl_count; + int aclsize; + int acl_count; zfs_acl_node_t *aclnode; - int error; + zfs_acl_phys_t znode_acl; + int version; + int error; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); @@ -993,48 +1039,69 @@ return (0); } - if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { - *aclpp = zfs_acl_node_read_internal(zp, will_modify); - if (!will_modify) - zp->z_acl_cached = *aclpp; - return (0); - } - - aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); - if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { - zfs_acl_phys_v0_t *zacl0 = - (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl; - - aclsize = ZFS_ACL_SIZE(zacl0->z_acl_count); - acl_count = zacl0->z_acl_count; - } else { - aclsize = zp->z_phys->zp_acl.z_acl_size; - acl_count = zp->z_phys->zp_acl.z_acl_count; - if (aclsize == 0) - aclsize = acl_count * sizeof (zfs_ace_t); - } - aclnode = zfs_acl_node_alloc(aclsize); - list_insert_head(&aclp->z_acl, aclnode); - error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, - aclsize, aclnode->z_acldata, DMU_READ_PREFETCH); - aclnode->z_ace_count = acl_count; + version = ZNODE_ACL_VERSION(zp); + + if ((error = zfs_acl_znode_info(zp, &aclsize, + &acl_count, &znode_acl)) != 0) + return (error); + + aclp = zfs_acl_alloc(version); + aclp->z_acl_count = acl_count; aclp->z_acl_bytes = aclsize; + aclnode = zfs_acl_node_alloc(aclsize); + aclnode->z_ace_count = aclp->z_acl_count; + aclnode->z_size = aclsize; + + if (!zp->z_is_sa) { + if (znode_acl.z_acl_extern_obj) { + error = dmu_read(zp->z_zfsvfs->z_os, + znode_acl.z_acl_extern_obj, 0, aclnode->z_size, + aclnode->z_acldata, DMU_READ_PREFETCH); + } else { + bcopy(znode_acl.z_ace_data, aclnode->z_acldata, + aclnode->z_size); + } + } else { + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), + aclnode->z_acldata, aclnode->z_size); + } + if (error != 0) { zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); /* convert checksum errors into IO errors */ if (error == ECKSUM) error = EIO; return (error); } + list_insert_head(&aclp->z_acl, aclnode); + *aclpp = aclp; if (!will_modify) zp->z_acl_cached = aclp; return (0); } +/*ARGSUSED*/ +void +zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, + boolean_t start, void *userdata) +{ + zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; + + if (start) { + cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); + } else { + cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, + cb->cb_acl_node); + } + *dataptr = cb->cb_acl_node->z_acldata; + *length = cb->cb_acl_node->z_size; +} + /* 
* common code for setting ACLs. * @@ -1045,28 +1112,33 @@ int zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) { - int error; - znode_phys_t *zphys = zp->z_phys; - zfs_acl_phys_t *zacl = &zphys->zp_acl; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t aoid = zphys->zp_acl.z_acl_extern_obj; - uint64_t off = 0; - dmu_object_type_t otype; - zfs_acl_node_t *aclnode; - - dmu_buf_will_dirty(zp->z_dbuf, tx); + int error; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_type_t otype; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t mode; + sa_bulk_attr_t bulk[5]; + uint64_t ctime[2]; + int count = 0; + + mode = zp->z_mode; + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags); + + zp->z_mode = mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } - zphys->zp_mode = zfs_mode_compute(zp, aclp); - /* - * Decide which object type to use. If we are forced to - * use old ACL format then transform ACL into zfs_oldace_t - * layout. + * Upgrade needed? */ if (!zfsvfs->z_use_fuids) { otype = DMU_OT_OLDACL; @@ -1078,84 +1150,113 @@ otype = DMU_OT_ACL; } - if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { - /* - * If ACL was previously external and we are now - * converting to new ACL format then release old - * ACL object and create a new one. - */ - if (aoid && aclp->z_version != zacl->z_acl_version) { - error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); - if (error) - return (error); - aoid = 0; - } - if (aoid == 0) { - aoid = dmu_object_alloc(zfsvfs->z_os, - otype, aclp->z_acl_bytes, - otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE, - otype == DMU_OT_ACL ? DN_MAX_BONUSLEN : 0, tx); + /* + * Arrgh, we have to handle old on disk format + * as well as newer (preferred) SA format. + */ + + if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ + locate.cb_aclp = aclp; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, aclp->z_acl_bytes); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), + NULL, &aclp->z_acl_count, sizeof (uint64_t)); + } else { /* Painful legacy way */ + zfs_acl_node_t *aclnode; + uint64_t off = 0; + zfs_acl_phys_t acl_phys; + uint64_t aoid; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + &acl_phys, sizeof (acl_phys))) != 0) + return (error); + + aoid = acl_phys.z_acl_extern_obj; + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && + aclp->z_version != acl_phys.z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, aoid, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? + DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? 
+ DN_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, + aoid, aclp->z_acl_bytes, 0, tx); + } + acl_phys.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } } else { - (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, - aclp->z_acl_bytes, 0, tx); - } - zphys->zp_acl.z_acl_extern_obj = aoid; - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - dmu_write(zfsvfs->z_os, aoid, off, - aclnode->z_size, aclnode->z_acldata, tx); - off += aclnode->z_size; + void *start = acl_phys.z_ace_data; + /* + * Migrating back embedded? + */ + if (acl_phys.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + acl_phys.z_acl_extern_obj, tx); + if (error) + return (error); + acl_phys.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } } - } else { - void *start = zacl->z_ace_data; /* - * Migrating back embedded? + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. */ - if (zphys->zp_acl.z_acl_extern_obj) { - error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); - if (error) - return (error); - zphys->zp_acl.z_acl_extern_obj = 0; + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + acl_phys.z_acl_size = aclp->z_acl_count; + acl_phys.z_acl_count = aclp->z_acl_bytes; + } else { + acl_phys.z_acl_size = aclp->z_acl_bytes; + acl_phys.z_acl_count = aclp->z_acl_count; } - - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - bcopy(aclnode->z_acldata, start, aclnode->z_size); - start = (caddr_t)start + aclnode->z_size; - } + acl_phys.z_acl_version = aclp->z_version; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (acl_phys)); } /* - * If Old version then swap count/bytes to match old - * layout of znode_acl_phys_t. - */ - if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { - zphys->zp_acl.z_acl_size = aclp->z_acl_count; - zphys->zp_acl.z_acl_count = aclp->z_acl_bytes; - } else { - zphys->zp_acl.z_acl_size = aclp->z_acl_bytes; - zphys->zp_acl.z_acl_count = aclp->z_acl_count; - } - - zphys->zp_acl.z_acl_version = aclp->z_version; - - /* * Replace ACL wide bits, but first clear them. 
*/ - zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS; - - zp->z_phys->zp_flags |= aclp->z_hints; + zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; + + zp->z_pflags |= aclp->z_hints; if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) - zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; - - return (0); + zp->z_pflags |= ZFS_ACL_TRIVIAL; + + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE); + return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); } /* @@ -1643,11 +1744,20 @@ mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); + *aclp = NULL; error = zfs_acl_node_read(zp, aclp, B_TRUE); if (error == 0) { - (*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS; - zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp); + uint64_t owner; + if (IS_EPHEMERAL(zp->z_uid)) + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_UID(zp->z_zfsvfs), + &owner, sizeof (owner))) != 0) + return (error); + else + owner = (uint64_t)zp->z_uid; + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(zp->z_zfsvfs, owner, mode, *aclp); } mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); @@ -1716,7 +1826,7 @@ *need_chmod = B_TRUE; pacep = NULL; aclp = zfs_acl_alloc(paclp->z_version); - if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD) + if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD || vtype == VLNK) return (aclp); while (pacep = zfs_acl_next_ace(paclp, pacep, &who, &access_mask, &iflags, &type)) { @@ -1837,6 +1947,8 @@ zfs_acl_t *paclp; gid_t gid; boolean_t need_chmod = B_TRUE; + boolean_t inherited = B_FALSE; + uint64_t parentgid; bzero(acl_ids, sizeof (zfs_acl_ids_t)); acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); @@ -1845,7 +1957,6 @@ if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) return (error); - /* * Determine uid and gid. */ @@ -1859,6 +1970,12 @@ ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; } else { + if (IS_EPHEMERAL(dzp->z_gid)) + VERIFY(0 == sa_lookup(dzp->z_sa_hdl, SA_ZPL_GID(zfsvfs), + &parentgid, sizeof (parentgid))); + else + parentgid = (uint64_t)dzp->z_gid; + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, cr, &acl_ids->z_fuidp); acl_ids->z_fgid = 0; @@ -1867,17 +1984,17 @@ (uint64_t)vap->va_gid, cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; - if (acl_ids->z_fgid != dzp->z_phys->zp_gid && + if (acl_ids->z_fgid != parentgid && !groupmember(vap->va_gid, cr) && secpolicy_vnode_create_gid(cr) != 0) acl_ids->z_fgid = 0; } if (acl_ids->z_fgid == 0) { - if (dzp->z_phys->zp_mode & S_ISGID) { + if (dzp->z_mode & S_ISGID) { char *domain; uint32_t rid; - acl_ids->z_fgid = dzp->z_phys->zp_gid; + acl_ids->z_fgid = parentgid; gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, cr, ZFS_GROUP); @@ -1907,7 +2024,7 @@ * file's new group, clear the file's set-GID bit. 
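
zfs_aclset_common() above is the template the rest of the patch follows for persisting several related znode fields at once: collect them into a sa_bulk_attr_t array with SA_ADD_BULK_ATTR(), optionally supplying a data locator callback such as zfs_acl_data_locator() when the value lives in scattered ACL nodes, and commit everything with a single sa_bulk_update(). A reduced sketch of that pattern, with placeholder attribute choices and an invented function name:

/*
 * Illustration only: batch mode, flags and ctime into one SA update.
 */
static int
zfs_bulk_update_sketch(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	sa_bulk_attr_t bulk[3];
	uint64_t mode = zp->z_mode;
	uint64_t ctime[2];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
	    &mode, sizeof (mode));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    &ctime, sizeof (ctime));

	/* Fill in ctime, then land the whole batch in one call. */
	zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE);
	return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
}
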
*/ - if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) && + if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && (vap->va_type == VDIR)) { acl_ids->z_mode |= S_ISGID; } else { @@ -1919,26 +2036,35 @@ if (acl_ids->z_aclp == NULL) { mutex_enter(&dzp->z_lock); if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR && - (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) && - !(dzp->z_phys->zp_flags & ZFS_XATTR)) { + (dzp->z_pflags & ZFS_INHERIT_ACE)) && + !(dzp->z_pflags & ZFS_XATTR)) { mutex_enter(&dzp->z_acl_lock); VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); - mutex_exit(&dzp->z_acl_lock); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, vap->va_type, paclp, acl_ids->z_mode, &need_chmod); + mutex_exit(&dzp->z_acl_lock); + inherited = B_TRUE; } else { acl_ids->z_aclp = zfs_acl_alloc(zfs_acl_version_zp(dzp)); + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } mutex_exit(&dzp->z_lock); if (need_chmod) { - acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ? + acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ? ZFS_ACL_AUTO_INHERIT : 0; zfs_acl_chmod(zfsvfs, acl_ids->z_fuid, acl_ids->z_mode, acl_ids->z_aclp); } } + if (inherited || vsecp) { + acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, + acl_ids->z_aclp, &acl_ids->z_aclp->z_hints); + if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + return (0); } @@ -1959,8 +2085,8 @@ boolean_t zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids) { - return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || - zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); + return (zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || + zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); } /* @@ -1978,12 +2104,12 @@ mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + if (mask == 0) + return (ENOSYS); + if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)) return (error); - if (mask == 0) - return (ENOSYS); - mutex_enter(&zp->z_acl_lock); error = zfs_acl_node_read(zp, &aclp, B_FALSE); @@ -1995,8 +2121,7 @@ /* * Scan ACL to determine number of ACEs */ - if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) && - !(mask & VSA_ACE_ALLTYPES)) { + if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { void *zacep = NULL; uint64_t who; uint32_t access_mask; @@ -2017,7 +2142,7 @@ } vsecp->vsa_aclcnt = count; } else - count = aclp->z_acl_count; + count = (int)aclp->z_acl_count; if (mask & VSA_ACECNT) { vsecp->vsa_aclcnt = count; @@ -2051,11 +2176,11 @@ } if (mask & VSA_ACE_ACLFLAGS) { vsecp->vsa_aclflags = 0; - if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED) + if (zp->z_pflags & ZFS_ACL_DEFAULTED) vsecp->vsa_aclflags |= ACL_DEFAULTED; - if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED) + if (zp->z_pflags & ZFS_ACL_PROTECTED) vsecp->vsa_aclflags |= ACL_PROTECTED; - if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT) + if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; } @@ -2137,7 +2262,7 @@ if (mask == 0) return (ENOSYS); - if (zp->z_phys->zp_flags & ZFS_IMMUTABLE) + if (zp->z_pflags & ZFS_IMMUTABLE) return (EPERM); if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) @@ -2153,37 +2278,40 @@ * existing flags. 
*/ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { - aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); + aclp->z_hints |= + (zp->z_pflags & V4_ACL_WIDE_FLAGS); } top: mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - - if (zp->z_phys->zp_acl.z_acl_extern_obj) { - /* Are we upgrading ACL? */ - if (zfsvfs->z_version <= ZPL_VERSION_FUID && - zp->z_phys->zp_acl.z_acl_version == - ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, - zp->z_phys->zp_acl.z_acl_extern_obj, - 0, DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } else { - dmu_tx_hold_write(tx, - zp->z_phys->zp_acl.z_acl_extern_obj, - 0, aclp->z_acl_bytes); - } - } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); - } + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); + /* + * If old version and ACL won't fit in bonus and we aren't + * upgrading then take out necessary DMU holds + */ + + if (ZFS_EXTERNAL_ACL(zp)) { + if (zfsvfs->z_version <= ZPL_VERSION_SA && + ZNODE_ACL_VERSION(zp) <= ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, ZFS_EXTERNAL_ACL(zp), 0, + DMU_OBJECT_END); + } else { + dmu_tx_hold_write(tx, ZFS_EXTERNAL_ACL(zp), + 0, aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); + } + + zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { mutex_exit(&zp->z_acl_lock); @@ -2206,7 +2334,6 @@ if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); zfs_log_acl(zilog, tx, zp, vsecp, fuidp); if (fuidp) @@ -2239,19 +2366,19 @@ */ if ((v4_mode & WRITE_MASK_DATA) && (((ZTOV(zp)->v_type != VDIR) && - (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) || + (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) || (ZTOV(zp)->v_type == VDIR && - (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) { + (zp->z_pflags & ZFS_IMMUTABLE)))) { return (EPERM); } if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && - (zp->z_phys->zp_flags & ZFS_NOUNLINK)) { + (zp->z_pflags & ZFS_NOUNLINK)) { return (EPERM); } if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && - (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) { + (zp->z_pflags & ZFS_AV_QUARANTINED))) { return (EACCES); } @@ -2298,10 +2425,7 @@ uint32_t deny_mask = 0; zfs_ace_hdr_t *acep = NULL; boolean_t checkit; - uid_t fowner; - uid_t gowner; - - zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + uint64_t gowner; mutex_enter(&zp->z_acl_lock); @@ -2311,6 +2435,12 @@ return (error); } + ASSERT(zp->z_acl_cached); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GID(zfsvfs), + &gowner, sizeof (gowner))) != 0) + return (error); + while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { uint32_t mask_matched; @@ -2332,7 +2462,7 @@ switch (entry_type) { case ACE_OWNER: - if (uid == fowner) + if (uid == zp->z_uid) checkit = B_TRUE; break; case OWNING_GROUP: @@ -2410,17 +2540,14 @@ uint32_t have = ACE_ALL_PERMS; if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { - uid_t owner; - - owner = zfs_fuid_map_id(zp->z_zfsvfs, - zp->z_phys->zp_uid, cr, ZFS_OWNER); - return ( - secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 || - secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 || - secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 || - secpolicy_vnode_chown(cr, 
owner) == 0 || - secpolicy_vnode_setdac(cr, owner) == 0 || + secpolicy_vnode_access(cr, ZTOV(zp), + zp->z_uid, VREAD) == 0 || secpolicy_vnode_access(cr, + ZTOV(zp), zp->z_uid, VWRITE) == 0 || + secpolicy_vnode_access(cr, ZTOV(zp), + zp->z_uid, VEXEC) == 0 || + secpolicy_vnode_chown(cr, zp->z_uid) == 0 || + secpolicy_vnode_setdac(cr, zp->z_uid) == 0 || secpolicy_vnode_remove(cr) == 0); } return (B_TRUE); @@ -2479,38 +2606,33 @@ boolean_t owner = B_FALSE; boolean_t groupmbr = B_FALSE; boolean_t is_attr; - uid_t fowner; - uid_t gowner; uid_t uid = crgetuid(cr); int error; - if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED) + if (zdp->z_pflags & ZFS_AV_QUARANTINED) return (EACCES); - is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) && + is_attr = ((zdp->z_pflags & ZFS_XATTR) && (ZTOV(zdp)->v_type == VDIR)); if (is_attr) goto slow; + mutex_enter(&zdp->z_acl_lock); - if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) { + if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { mutex_exit(&zdp->z_acl_lock); return (0); } - if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 || - FUID_INDEX(zdp->z_phys->zp_gid) != 0) { + if (IS_EPHEMERAL(zdp->z_uid) != 0 || IS_EPHEMERAL(zdp->z_gid) != 0) { mutex_exit(&zdp->z_acl_lock); goto slow; } - fowner = (uid_t)zdp->z_phys->zp_uid; - gowner = (uid_t)zdp->z_phys->zp_gid; - - if (uid == fowner) { + if (uid == zdp->z_uid) { owner = B_TRUE; - if (zdp->z_phys->zp_mode & S_IXUSR) { + if (zdp->z_mode & S_IXUSR) { mutex_exit(&zdp->z_acl_lock); return (0); } else { @@ -2518,9 +2640,9 @@ goto slow; } } - if (groupmember(gowner, cr)) { + if (groupmember(zdp->z_gid, cr)) { groupmbr = B_TRUE; - if (zdp->z_phys->zp_mode & S_IXGRP) { + if (zdp->z_mode & S_IXGRP) { mutex_exit(&zdp->z_acl_lock); return (0); } else { @@ -2529,7 +2651,7 @@ } } if (!owner && !groupmbr) { - if (zdp->z_phys->zp_mode & S_IXOTH) { + if (zdp->z_mode & S_IXOTH) { mutex_exit(&zdp->z_acl_lock); return (0); } @@ -2555,20 +2677,25 @@ uint32_t working_mode; int error; int is_attr; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; boolean_t check_privs; znode_t *xzp; znode_t *check_zp = zp; - is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) && - (ZTOV(zp)->v_type == VDIR)); + is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); /* * If attribute then validate against base file */ if (is_attr) { + uint64_t parent; + + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zp->z_zfsvfs), &parent, + sizeof (parent))) != 0) + return (error); + if ((error = zfs_zget(zp->z_zfsvfs, - zp->z_phys->zp_parent, &xzp)) != 0) { + parent, &xzp)) != 0) { return (error); } @@ -2607,12 +2734,8 @@ } if (error && check_privs) { - uid_t owner; mode_t checkmode = 0; - owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr, - ZFS_OWNER); - /* * First check for implicit owner permission on * read_acl/read_attributes @@ -2622,7 +2745,7 @@ ASSERT(working_mode != 0); if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && - owner == crgetuid(cr))) + zp->z_uid == crgetuid(cr))) working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| @@ -2636,19 +2759,19 @@ if (checkmode) error = secpolicy_vnode_access(cr, ZTOV(check_zp), - owner, checkmode); + zp->z_uid, checkmode); if (error == 0 && (working_mode & ACE_WRITE_OWNER)) - error = secpolicy_vnode_chown(cr, owner); + error = secpolicy_vnode_chown(cr, zp->z_uid); if (error == 0 && (working_mode & ACE_WRITE_ACL)) - error = secpolicy_vnode_setdac(cr, owner); + error = secpolicy_vnode_setdac(cr, zp->z_uid); if (error == 0 && (working_mode & 
(ACE_DELETE|ACE_DELETE_CHILD))) error = secpolicy_vnode_remove(cr); if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { - error = secpolicy_vnode_chown(cr, owner); + error = secpolicy_vnode_chown(cr, zp->z_uid); } if (error == 0) { /* @@ -2693,12 +2816,9 @@ mode_t missing_perms, cred_t *cr) { int error; - uid_t downer; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - downer = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr, ZFS_OWNER); - - error = secpolicy_vnode_access(cr, ZTOV(dzp), downer, missing_perms); + + error = secpolicy_vnode_access(cr, ZTOV(dzp), + dzp->z_uid, missing_perms); if (error == 0) error = zfs_sticky_remove_access(dzp, zp, cr); @@ -2765,7 +2885,7 @@ * to determine what was found. */ - if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) return (EPERM); /* @@ -2835,7 +2955,7 @@ int add_perm; int error; - if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED) + if (szp->z_pflags & ZFS_AV_QUARANTINED) return (EACCES); add_perm = (ZTOV(szp)->v_type == VDIR) ? diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/zfs_byteswap.c --- a/usr/src/uts/common/fs/zfs/zfs_byteswap.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_byteswap.c Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -27,6 +27,7 @@ #include #include #include +#include #include void diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/zfs_ctldir.c --- a/usr/src/uts/common/fs/zfs/zfs_ctldir.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_ctldir.c Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -215,6 +215,7 @@ { vnode_t *vp, *rvp; zfsctl_node_t *zcp; + uint64_t crtime[2]; ASSERT(zfsvfs->z_ctldir == NULL); @@ -225,7 +226,9 @@ zcp->zc_id = ZFSCTL_INO_ROOT; VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0); - ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime); + VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + &crtime, sizeof (crtime))); + ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime); VN_RELE(rvp); /* diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/zfs_dir.c --- a/usr/src/uts/common/fs/zfs/zfs_dir.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_dir.c Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -52,6 +52,8 @@ #include #include #include +#include +#include #include #include @@ -286,8 +288,10 @@ * See if there's an object by this name; if so, put a hold on it. */ if (flag & ZXATTR) { - zoid = dzp->z_phys->zp_xattr; - error = (zoid == 0 ? ENOENT : 0); + error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, + sizeof (zoid)); + if (error == 0) + error = (zoid == 0 ? ENOENT : 0); } else { if (update) vp = dnlc_lookup(ZTOV(dzp), name); @@ -379,25 +383,29 @@ zfs_dirlock_t *dl; znode_t *zp; int error = 0; + uint64_t parent; if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { *vpp = ZTOV(dzp); VN_HOLD(*vpp); } else if (name[0] == '.' && name[1] == '.' 
&& name[2] == 0) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + /* * If we are a snapshot mounted under .zfs, return * the vp for the snapshot directory. */ - if (dzp->z_phys->zp_parent == dzp->z_id && - zfsvfs->z_parent != zfsvfs) { + if ((error = sa_lookup(dzp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + return (error); + if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, "snapshot", vpp, NULL, 0, NULL, kcred, NULL, NULL, NULL); return (error); } rw_enter(&dzp->z_parent_lock, RW_READER); - error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp); + error = zfs_zget(zfsvfs, parent, &zp); if (error == 0) *vpp = ZTOV(zp); rw_exit(&dzp->z_parent_lock); @@ -445,7 +453,7 @@ zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp->z_unlinked); - ASSERT3U(zp->z_phys->zp_links, ==, 0); + ASSERT(zp->z_links == 0); VERIFY3U(0, ==, zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); @@ -540,10 +548,12 @@ (ZTOV(xzp)->v_type == VLNK)); tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, dzp->z_id); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); - dmu_tx_hold_bonus(tx, xzp->z_id); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + /* Is this really needed ? */ + zfs_sa_upgrade_txholds(tx, xzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); @@ -576,15 +586,16 @@ znode_t *xzp = NULL; dmu_tx_t *tx; uint64_t acl_obj; + uint64_t xattr_obj; int error; + ASSERT(zp->z_links == 0); ASSERT(ZTOV(zp)->v_count == 0); - ASSERT(zp->z_phys->zp_links == 0); /* * If this is an attribute directory, purge its contents. */ - if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) { + if (ZTOV(zp)->v_type == VDIR && (zp->z_pflags & ZFS_XATTR)) { if (zfs_purgedir(zp) != 0) { /* * Not enough space to delete some xattrs. @@ -613,12 +624,14 @@ * If the file has extended attributes, we're going to unlink * the xattr dir. */ - if (zp->z_phys->zp_xattr) { - error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT(error == 0); } - acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; + acl_obj = ZFS_EXTERNAL_ACL(zp); /* * Set up the final transaction. 
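
The transaction setup in zfs_purgedir() above shows the hold pattern the patch adopts in place of dmu_tx_hold_bonus(): take a dmu_tx_hold_sa() hold on each znode's SA handle and add zfs_sa_upgrade_txholds() so a legacy bonus-buffer znode can be migrated to the SA layout within the same transaction. A stripped-down sketch with an invented wrapper name:

/*
 * Illustration only: create a transaction that can modify (and, if
 * needed, upgrade) one SA-backed znode.
 */
static int
zfs_sa_tx_sketch(znode_t *zp, dmu_tx_t **txp)
{
	dmu_tx_t *tx = dmu_tx_create(zp->z_zfsvfs->z_os);
	int error;

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);

	if ((error = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
		dmu_tx_abort(tx);
		return (error);
	}
	*txp = tx;
	return (0);
}
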
@@ -627,11 +640,13 @@ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); if (xzp) { - dmu_tx_hold_bonus(tx, xzp->z_id); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } if (acl_obj) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { /* @@ -646,10 +661,12 @@ } if (xzp) { - dmu_buf_will_dirty(xzp->z_dbuf, tx); + ASSERT(error == 0); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ - xzp->z_phys->zp_links = 0; /* no more links to it */ + xzp->z_links = 0; /* no more links to it */ + VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &xzp->z_links, sizeof (xzp->z_links), tx)); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); } @@ -667,11 +684,12 @@ } static uint64_t -zfs_dirent(znode_t *zp) +zfs_dirent(znode_t *zp, uint64_t mode) { uint64_t de = zp->z_id; + if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) - de |= IFTODT((zp)->z_phys->zp_mode) << 60; + de |= IFTODT(mode) << 60; return (de); } @@ -682,12 +700,15 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) { znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); uint64_t value; int zp_is_dir = (vp->v_type == VDIR); + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; int error; - dmu_buf_will_dirty(zp->z_dbuf, tx); mutex_enter(&zp->z_lock); if (!(flag & ZRENAMING)) { @@ -696,22 +717,47 @@ mutex_exit(&zp->z_lock); return (ENOENT); } - zp->z_phys->zp_links++; + zp->z_links++; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + } - zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &dzp->z_id, sizeof (dzp->z_id)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); - if (!(flag & ZNEW)) - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + if (!(flag & ZNEW)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime, B_TRUE); + } + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + mutex_exit(&zp->z_lock); - dmu_buf_will_dirty(dzp->z_dbuf, tx); mutex_enter(&dzp->z_lock); - dzp->z_phys->zp_size++; /* one dirent added */ - dzp->z_phys->zp_links += zp_is_dir; /* ".." 
link from zp */ - zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); + dzp->z_size++; + dzp->z_links += zp_is_dir; + count = 0; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); mutex_exit(&dzp->z_lock); - value = zfs_dirent(zp); + value = zfs_dirent(zp, zp->z_mode); error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, 8, 1, &value, tx); ASSERT(error == 0); @@ -733,16 +779,18 @@ boolean_t *unlinkedp) { znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; vnode_t *vp = ZTOV(zp); int zp_is_dir = (vp->v_type == VDIR); boolean_t unlinked = B_FALSE; + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; int error; dnlc_remove(ZTOV(dzp), dl->dl_name); if (!(flag & ZRENAMING)) { - dmu_buf_will_dirty(zp->z_dbuf, tx); - if (vn_vfswlock(vp)) /* prevent new mounts on zp */ return (EBUSY); @@ -752,35 +800,58 @@ } mutex_enter(&zp->z_lock); - if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */ + + if (zp_is_dir && !zfs_dirempty(zp)) { mutex_exit(&zp->z_lock); vn_vfsunlock(vp); return (EEXIST); } - if (zp->z_phys->zp_links <= zp_is_dir) { + + if (zp->z_links <= zp_is_dir) { zfs_panic_recover("zfs: link count on %s is %u, " "should be at least %u", zp->z_vnode->v_path ? zp->z_vnode->v_path : - "", (int)zp->z_phys->zp_links, + "", (int)zp->z_links, zp_is_dir + 1); - zp->z_phys->zp_links = zp_is_dir + 1; + zp->z_links = zp_is_dir + 1; } - if (--zp->z_phys->zp_links == zp_is_dir) { + if (--zp->z_links == zp_is_dir) { zp->z_unlinked = B_TRUE; - zp->z_phys->zp_links = 0; + zp->z_links = 0; unlinked = B_TRUE; } else { - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, + B_TRUE); } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &zp->z_links, sizeof (zp->z_links)); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + count = 0; + ASSERT(error == 0); mutex_exit(&zp->z_lock); vn_vfsunlock(vp); } - dmu_buf_will_dirty(dzp->z_dbuf, tx); mutex_enter(&dzp->z_lock); - dzp->z_phys->zp_size--; /* one dirent removed */ - dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */ - zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); + dzp->z_size--; /* one dirent removed */ + dzp->z_links -= zp_is_dir; /* ".." 
link from zp */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); mutex_exit(&dzp->z_lock); if (zp->z_zfsvfs->z_norm) { @@ -815,7 +886,7 @@ boolean_t zfs_dirempty(znode_t *dzp) { - return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0); + return (dzp->z_size == 2 && dzp->z_dirlocks == 0); } int @@ -827,6 +898,7 @@ int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; + uint64_t parent; *xvpp = NULL; @@ -842,7 +914,9 @@ } tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) @@ -855,14 +929,18 @@ dmu_tx_abort(tx); return (error); } - zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids); + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - ASSERT(xzp->z_phys->zp_parent == zp->z_id); - dmu_buf_will_dirty(zp->z_dbuf, tx); - zp->z_phys->zp_xattr = xzp->z_id; + if ((error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) + return (0); + + ASSERT(parent == zp->z_id); + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, + sizeof (xzp->z_id), tx)); (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL, acl_ids.z_fuidp, vap); @@ -907,7 +985,6 @@ return (0); } - ASSERT(zp->z_phys->zp_xattr == 0); if (!(flags & CREATE_XATTR_DIR)) { zfs_dirent_unlock(dl); @@ -962,20 +1039,14 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) { uid_t uid; - uid_t downer; - uid_t fowner; - zfsvfs_t *zfsvfs = zdp->z_zfsvfs; if (zdp->z_zfsvfs->z_replay) return (0); - if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) + if ((zdp->z_mode & S_ISVTX) == 0) return (0); - downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER); - fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER); - - if ((uid = crgetuid(cr)) == downer || uid == fowner || + if ((uid = crgetuid(cr)) == zdp->z_uid || uid == zp->z_uid || (ZTOV(zp)->v_type == VREG && zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) return (0); diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/zfs_fuid.c --- a/usr/src/uts/common/fs/zfs/zfs_fuid.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_fuid.c Tue Mar 16 09:43:38 2010 -0600 @@ -389,10 +389,26 @@ void zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) { - *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid, - cr, ZFS_OWNER); - *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid, - cr, ZFS_GROUP); + uint64_t fuid, fgid; + sa_bulk_attr_t bulk[2]; + int count = 0; + + if (IS_EPHEMERAL(zp->z_uid) || IS_EPHEMERAL(zp->z_gid)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zp->z_zfsvfs), + NULL, &fuid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zp->z_zfsvfs), + NULL, &fgid, 8); + VERIFY(0 == 
sa_bulk_lookup(zp->z_sa_hdl, bulk, count)); + } + if (IS_EPHEMERAL(zp->z_uid)) + *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + else + *uidp = zp->z_uid; + if (IS_EPHEMERAL(zp->z_gid)) + *gidp = zfs_fuid_map_id(zp->z_zfsvfs, + zp->z_gid, cr, ZFS_GROUP); + else + *gidp = zp->z_gid; } uid_t diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/zfs_ioctl.c --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c Tue Mar 16 09:43:38 2010 -0600 @@ -68,6 +68,7 @@ #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" +#include "zfs_comutil.h" extern struct modlfs zfs_modlfs; @@ -1954,20 +1955,10 @@ case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; - uint64_t maxzplver = ZPL_VERSION; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs)) != 0) break; - if (zfs_earlier_version(dsname, SPA_VERSION_USERSPACE)) - maxzplver = ZPL_VERSION_USERSPACE - 1; - if (zfs_earlier_version(dsname, SPA_VERSION_FUID)) - maxzplver = ZPL_VERSION_FUID - 1; - if (intval > maxzplver) { - zfsvfs_rele(zfsvfs, FTAG); - return (ENOTSUP); - } - err = zfs_set_version(zfsvfs, intval); zfsvfs_rele(zfsvfs, FTAG); @@ -2558,8 +2549,8 @@ */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, - boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops, - boolean_t *is_ci) + boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, + nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; @@ -2595,6 +2586,7 @@ */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || + (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) @@ -2636,11 +2628,13 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { - boolean_t fuids_ok = B_TRUE; + boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[MAXNAMELEN]; char *cp; + spa_t *spa; + uint64_t spa_vers; int error; (void) strlcpy(parentname, dataset, sizeof (parentname)); @@ -2648,12 +2642,15 @@ ASSERT(cp != NULL); cp[0] = '\0'; - if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE)) - zplver = ZPL_VERSION_USERSPACE - 1; - if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) { - zplver = ZPL_VERSION_FUID - 1; - fuids_ok = B_FALSE; - } + if ((error = spa_open(dataset, &spa, FTAG)) != 0) + return (error); + + spa_vers = spa_version(spa); + spa_close(spa, FTAG); + + zplver = zfs_zpl_version_map(spa_vers); + fuids_ok = (zplver >= ZPL_VERSION_FUID); + sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. 
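
zfs_fill_zplprops() above now asks the pool for its version and derives the default ZPL version from it through zfs_zpl_version_map(), with fuids_ok and sa_ok falling out as simple comparisons against ZPL_VERSION_FUID and ZPL_VERSION_SA. The mapping it relies on has roughly the shape sketched below; the real table lives in zfs_comutil.c, and the cutoffs here are an assumption for illustration, not the changeset's actual code.

/*
 * Assumed shape of the SPA-to-ZPL version mapping (sketch only).
 */
static uint64_t
zpl_version_map_sketch(uint64_t spa_vers)
{
	if (spa_vers >= SPA_VERSION_SA)
		return (ZPL_VERSION_SA);	/* system attributes allowed */
	if (spa_vers >= SPA_VERSION_USERSPACE)
		return (ZPL_VERSION_USERSPACE);
	if (spa_vers >= SPA_VERSION_FUID)
		return (ZPL_VERSION_FUID);
	return (ZPL_VERSION_INITIAL);
}
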
@@ -2661,7 +2658,7 @@ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); - error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, createprops, + error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); @@ -2671,17 +2668,17 @@ zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { - boolean_t fuids_ok = B_TRUE; + boolean_t fuids_ok; + boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; - if (spa_vers < SPA_VERSION_FUID) { - zplver = ZPL_VERSION_FUID - 1; - fuids_ok = B_FALSE; - } - - error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, createprops, - zplprops, is_ci); + zplver = zfs_zpl_version_map(spa_vers); + fuids_ok = (zplver >= ZPL_VERSION_FUID); + sa_ok = (zplver >= ZPL_VERSION_SA); + + error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, + createprops, zplprops, is_ci); return (error); } diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/zfs_log.c --- a/usr/src/uts/common/fs/zfs/zfs_log.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_log.c Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -276,21 +276,25 @@ lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_foid = zp->z_id; - lr->lr_mode = zp->z_phys->zp_mode; - if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) { - lr->lr_uid = (uint64_t)zp->z_phys->zp_uid; + lr->lr_mode = zp->z_mode; + if (!IS_EPHEMERAL(zp->z_uid)) { + lr->lr_uid = (uint64_t)zp->z_uid; } else { lr->lr_uid = fuidp->z_fuid_owner; } - if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) { - lr->lr_gid = (uint64_t)zp->z_phys->zp_gid; + if (!IS_EPHEMERAL(zp->z_gid)) { + lr->lr_gid = (uint64_t)zp->z_gid; } else { lr->lr_gid = fuidp->z_fuid_group; } - lr->lr_gen = zp->z_phys->zp_gen; - lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; - lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; - lr->lr_rdev = zp->z_phys->zp_rdev; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, + sizeof (uint64_t)); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + lr->lr_crtime, sizeof (uint64_t) * 2); + + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev, + sizeof (lr->lr_rdev)) != 0) + lr->lr_rdev = 0; /* * Fill in xvattr info if any @@ -404,12 +408,13 @@ lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_foid = zp->z_id; - lr->lr_mode = zp->z_phys->zp_mode; - lr->lr_uid = zp->z_phys->zp_uid; - lr->lr_gid = zp->z_phys->zp_gid; - lr->lr_gen = zp->z_phys->zp_gen; - lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; - lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; + lr->lr_uid = zp->z_uid; + lr->lr_gid = zp->z_gid; + lr->lr_mode = zp->z_mode; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, + sizeof (uint64_t)); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + lr->lr_crtime, sizeof (uint64_t) * 2); bcopy(name, (char *)(lr + 1), namesize); bcopy(link, (char *)(lr + 1) + namesize, linksize); diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/zfs_replay.c --- a/usr/src/uts/common/fs/zfs/zfs_replay.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_replay.c Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -645,7 +645,7 @@ length = lr->lr_length; eod = offset + length; /* end of data for this write */ - orig_eof = zp->z_phys->zp_size; + orig_eof = zp->z_size; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { @@ -667,8 +667,8 @@ * write needs to be there. So we write the whole block and * reduce the eof. */ - if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */ - zp->z_phys->zp_size = eod; + if (orig_eof < zp->z_size) /* file length grew ? */ + zp->z_size = eod; VN_RELE(ZTOV(zp)); @@ -695,9 +695,9 @@ return (error); end = lr->lr_offset + lr->lr_length; - if (end > zp->z_phys->zp_size) { - ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz); - zp->z_phys->zp_size = end; + if (end > zp->z_size) { + ASSERT3U(end - zp->z_size, <, zp->z_blksz); + zp->z_size = end; } VN_RELE(ZTOV(zp)); diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/zfs_rlock.c --- a/usr/src/uts/common/fs/zfs/zfs_rlock.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_rlock.c Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -112,7 +112,7 @@ * Range locking is also used by zvol and uses a * dummied up znode. However, for zvol, we don't need to * append or grow blocksize, and besides we don't have - * a z_phys or z_zfsvfs - so skip that processing. + * a "sa" data or z_zfsvfs - so skip that processing. * * Yes, this is ugly, and would be solved by not handling * grow or append in range lock code. If that was done then @@ -125,14 +125,14 @@ * This is done under z_range_lock to avoid races. */ if (new->r_type == RL_APPEND) - new->r_off = zp->z_phys->zp_size; + new->r_off = zp->z_size; /* * If we need to grow the block size then grab the whole * file range. This is also done under z_range_lock to * avoid races. */ - end_size = MAX(zp->z_phys->zp_size, new->r_off + len); + end_size = MAX(zp->z_size, new->r_off + len); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { new->r_off = 0; diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/zfs_sa.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/fs/zfs/zfs_sa.c Tue Mar 16 09:43:38 2010 -0600 @@ -0,0 +1,340 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include +#include +#include +#include +#include +#include + +/* + * ZPL attribute registration table. + * Order of attributes doesn't matter + * a unique value will be assigned for each + * attribute that is file system specific + * + * This is just the set of ZPL attributes that this + * version of ZFS deals with natively. The file system + * could have other attributes stored in files, but they will be + * ignored. The SA framework will preserve them, just that + * this version of ZFS won't change or delete them. + */ + +sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { + {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, + {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, + {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, + {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, + {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, + {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, + {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, + {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, + {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, + {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, + {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, + {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, + {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, + {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, + {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, + {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, + {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, + {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0}, + {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0}, + {"ZPL_DACL_ACES", 0, SA_ACL, 0}, + {NULL, 0, 0, 0} +}; + +#ifdef _KERNEL + +int +zfs_sa_readlink(znode_t *zp, uio_t *uio) +{ + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + size_t bufsz; + int error; + + bufsz = zp->z_size; + if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE < db->db_size) { + error = uiomove((caddr_t)db->db_data + + ZFS_OLD_ZNODE_PHYS_SIZE, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + } else { + dmu_buf_t *dbp; + if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id, + 0, FTAG, &dbp)) == 0) { + error = uiomove(dbp->db_data, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + dmu_buf_rele(dbp, FTAG); + } + } + return (error); +} + +void +zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) +{ + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + + if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) { + VERIFY(dmu_set_bonus(db, + len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0); + if (len) { + bcopy(link, (caddr_t)db->db_data + + ZFS_OLD_ZNODE_PHYS_SIZE, len); + } + } else { + dmu_buf_t *dbp; + + zfs_grow_blocksize(zp, len, tx); + VERIFY(0 == dmu_buf_hold(zp->z_zfsvfs->z_os, + zp->z_id, 0, FTAG, &dbp)); + + dmu_buf_will_dirty(dbp, tx); + + ASSERT3U(len, <=, dbp->db_size); + bcopy(link, dbp->db_data, len); + dmu_buf_rele(dbp, FTAG); + } +} + +void +zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + xoptattr_t *xoap; + + VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); + if (zp->z_is_sa) { + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), + &xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp)) != 0) + return; + } else { + dmu_object_info_t doi; + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + int len; + + if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP)) + return; + + sa_object_info(zp->z_sa_hdl, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + ZFS_OLD_ZNODE_PHYS_SIZE; + + if (len <= doi.doi_bonus_size) { + (void) 
memcpy(xoap->xoa_av_scanstamp, + (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + sizeof (xoap->xoa_av_scanstamp)); + } + } + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); +} + +void +zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + xoptattr_t *xoap; + + VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); + if (zp->z_is_sa) + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), + &xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp), tx)); + else { + dmu_object_info_t doi; + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + int len; + + sa_object_info(zp->z_sa_hdl, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + ZFS_OLD_ZNODE_PHYS_SIZE; + if (len > doi.doi_bonus_size) + VERIFY(dmu_set_bonus(db, len, tx) == 0); + (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp)); + + zp->z_pflags |= ZFS_BONUS_SCANSTAMP; + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + &zp->z_pflags, sizeof (uint64_t), tx)); + } +} + +/* + * I'm not convinced we should do any of this upgrade. + * since the SA code can read both old/new znode formats + * with probably little to know performance difference. + * + * All new files will be created with the new format. + */ + +void +zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) +{ + dmu_buf_t *db = sa_get_db(hdl); + znode_t *zp = sa_get_userdata(hdl); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + sa_bulk_attr_t bulk[20]; + int count = 0; + sa_bulk_attr_t sa_attrs[20] = { 0 }; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t uid, gid, mode, rdev, xattr, parent; + uint64_t crtime[2], mtime[2], ctime[2]; + zfs_acl_phys_t znode_acl; + char *slink = NULL; + char scanstamp[AV_SCANSTAMP_SZ]; + + /* + * No upgrade if ACL isn't cached + * since we won't know which locks are held + * and ready the ACL would require special "locked" + * interfaces that would be messy + */ + if (zp->z_acl_cached == NULL) + return; + + /* First do a bulk query of the attributes that aren't cached */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &znode_acl, 88); + + if (sa_bulk_lookup_locked(hdl, bulk, count) != 0) + return; + + + /* + * While the order here doesn't matter its best to try and organize + * it is such a way to pick up an already existing layout number + */ + count = 0; + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs), + NULL, &zp->z_gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + 
SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, + zp->z_atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL, + &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, 8); + if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR) + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &zp->z_acl_cached->z_acl_count, 8); + + if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID) + zfs_acl_xform(zp, zp->z_acl_cached, CRED()); + + locate.cb_aclp = zp->z_acl_cached; + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes); + if (xattr) + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), + NULL, &rdev, 8); + + /* + * is it a symlink ? + * + * this will probably never be exercised since we won't + * have the cached ACL. + */ + if (ZTOV(zp)->v_type == VLNK) { + slink = kmem_zalloc(zp->z_size + 1, KM_SLEEP); + if (zp->z_size + ZFS_OLD_ZNODE_PHYS_SIZE) + bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + slink, zp->z_size); + else { + dmu_buf_t *dbp; + if (dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, + FTAG, &dbp)) + return; + bcopy(dbp->db_data, slink, zp->z_size); + dmu_buf_rele(dbp, FTAG); + } + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SYMLINK(zfsvfs), + NULL, slink, zp->z_size); + } + + /* if scanstamp then add scanstamp */ + + if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { + bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + scanstamp, AV_SCANSTAMP_SZ); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), + NULL, scanstamp, AV_SCANSTAMP_SZ); + zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; + } + + VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); + VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs, + count, tx) == 0); + if (znode_acl.z_acl_extern_obj) + VERIFY(0 == dmu_object_free(zfsvfs->z_os, + znode_acl.z_acl_extern_obj, tx)); + + zp->z_is_sa = B_TRUE; + if (slink) + kmem_free(slink, zp->z_size + 1); + +} + +void +zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) +{ + if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa) + return; + + ASSERT(!zp->z_is_sa); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + if (ZFS_EXTERNAL_ACL(zp)) { + dmu_tx_hold_free(tx, ZFS_EXTERNAL_ACL(zp), 0, + DMU_OBJECT_END); + } +} + +#endif diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/zfs_vfsops.c --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c Tue Mar 16 09:43:38 2010 -0600 @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +61,8 @@ #include #include #include +#include +#include "zfs_comutil.h" int zfsfstype; vfsops_t *zfs_vfsops = NULL; @@ -582,6 +585,7 @@ (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid); err = zap_lookup(os, obj, buf, 8, 1, &used); + ASSERT(err == 0 || err == ENOENT); /* no underflow/overflow */ ASSERT(delta > 0 || used >= -delta); @@ -592,20 +596,38 @@ else err = zap_update(os, obj, buf, 8, 1, &used, tx); ASSERT(err == 0); + } static int -zfs_space_delta_cb(dmu_object_type_t bonustype, void *bonus, +zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, uint64_t *groupp) { - znode_phys_t *znp = bonus; + znode_phys_t *znp = data; + int 
error = 0; - if (bonustype != DMU_OT_ZNODE) + if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) return (ENOENT); - *userp = znp->zp_uid; - *groupp = znp->zp_gid; - return (0); + if (bonustype == DMU_OT_ZNODE) { + *userp = znp->zp_uid; + *groupp = znp->zp_gid; + } else { + int hdrsize; + + ASSERT(bonustype == DMU_OT_SA); + hdrsize = sa_hdrsize(data); + + if (hdrsize != 0) { + *userp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_UID_OFFSET)); + *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_GID_OFFSET)); + } else { + error = ENOENT; + } + } + return (error); } static void @@ -792,7 +814,7 @@ } boolean_t -zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) +zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) { char buf[32]; uint64_t used, quota, usedobj, quotaobj; @@ -815,6 +837,32 @@ return (used >= quota); } +boolean_t +zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) +{ + uint64_t fuid; + uint64_t quotaobj; + uid_t id; + + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + id = isgroup ? zp->z_gid : zp->z_uid; + + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + if (IS_EPHEMERAL(id)) { + VERIFY(0 == sa_lookup(zp->z_sa_hdl, + isgroup ? SA_ZPL_GID(zfsvfs) : SA_ZPL_UID(zfsvfs), + &fuid, sizeof (fuid))); + } else { + fuid = (uint64_t)id; + } + + return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); +} + + int zfsvfs_create(const char *osname, zfsvfs_t **zfvp) { @@ -822,6 +870,7 @@ zfsvfs_t *zfsvfs; uint64_t zval; int i, error; + uint64_t sa_obj; zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); @@ -879,6 +928,26 @@ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + if (error) + return (error); + } else { + /* + * Pre SA versions file systems should never touch + * either the attribute registration or layout objects. 
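A condensed view of the mount-time wiring that zfsvfs_create() performs around this point, shown as a sketch rather than the exact code (os and zfsvfs come from the surrounding function; that the SA_ZPL_*() accessors used elsewhere in the patch resolve through the z_attr_table built by sa_setup() is an assumption consistent with this changeset):

        uint64_t sa_obj = 0;
        int error = 0;

        if (zfsvfs->z_use_sa) {
                /* the master node names the SA attribute registration object */
                error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
                    8, 1, &sa_obj);
        }
        if (error == 0) {
                /* register the ZPL attribute table; SA_ZPL_*() indexes into it */
                zfsvfs->z_attr_table = sa_setup(os, sa_obj,
                    zfs_attr_table, ZPL_END);
        }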
+ */ + sa_obj = 0; + } + + zfsvfs->z_attr_table = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END); + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); @@ -1051,6 +1120,7 @@ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int @@ -1732,7 +1802,7 @@ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) - if (zp->z_dbuf) { + if (zp->z_sa_hdl) { ASSERT(ZTOV(zp)->v_count > 0); zfs_znode_dmu_fini(zp); } @@ -1927,7 +1997,9 @@ ZFS_EXIT(zfsvfs); return (err); } - zp_gen = zp->z_phys->zp_gen & gen_mask; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, + sizeof (uint64_t)); + zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { @@ -1966,7 +2038,7 @@ int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) { - int err; + int err, err2; ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); @@ -1977,6 +2049,17 @@ zfsvfs->z_os = NULL; } else { znode_t *zp; + uint64_t sa_obj = 0; + + err2 = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj); + + if ((err || err2) && zfsvfs->z_version >= ZPL_VERSION_SA) + goto bail; + + + zfsvfs->z_attr_table = sa_setup(zfsvfs->z_os, sa_obj, + zfs_attr_table, ZPL_END); VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); @@ -1995,6 +2078,7 @@ } +bail: /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrw_exit(&zfsvfs->z_teardown_lock, FTAG); @@ -2111,13 +2195,23 @@ if (newvers < zfsvfs->z_version) return (EINVAL); + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (ENOTSUP); + tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); @@ -2126,6 +2220,22 @@ return (error); } + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT3U(error, ==, 0); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + spa_history_internal_log(LOG_DS_UPGRADE, dmu_objset_spa(os), tx, CRED(), "oldver=%llu newver=%llu dataset = %llu", diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/zfs_vnops.c --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c Tue Mar 16 09:43:38 2010 -0600 @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -69,6 +70,7 @@ #include "fs/fs_subr.h" #include #include +#include #include #include #include @@ -176,7 +178,7 @@ ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && + if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) 
== 0)) { ZFS_EXIT(zfsvfs); return (EPERM); @@ -184,8 +186,7 @@ if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && - !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && - zp->z_phys->zp_size > 0) { + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (fs_vscan(*vpp, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (EACCES); @@ -223,8 +224,7 @@ if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && - !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && - zp->z_phys->zp_size > 0) + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); ZFS_EXIT(zfsvfs); @@ -244,7 +244,7 @@ int error; boolean_t hole; - file_sz = zp->z_phys->zp_size; + file_sz = zp->z_size; if (noff >= file_sz) { return (ENXIO); } @@ -453,7 +453,7 @@ ZFS_VERIFY_ZP(zp); os = zfsvfs->z_os; - if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { + if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (EACCES); } @@ -477,7 +477,7 @@ /* * Check for mandatory locks */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { + if (MANDMODE(zp->z_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); @@ -500,13 +500,13 @@ * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ - if (uio->uio_loffset >= zp->z_phys->zp_size) { + if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } - ASSERT(uio->uio_loffset < zp->z_phys->zp_size); - n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); + ASSERT(uio->uio_loffset < zp->z_size); + n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { @@ -531,8 +531,8 @@ */ while (--nblk >= 0) { (void) dmu_xuio_add(xuio, - dmu_request_arcbuf(zp->z_dbuf, blksz), - 0, blksz); + dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz), 0, blksz); } } } @@ -580,6 +580,7 @@ * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ + /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) @@ -596,7 +597,6 @@ ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; - uint64_t pflags; int error; arc_buf_t *abuf; iovec_t *aiov; @@ -605,6 +605,9 @@ int iovcnt = uio->uio_iovcnt; iovec_t *iovp = uio->uio_iov; int write_eof; + int count = 0; + sa_bulk_attr_t bulk[4]; + uint64_t mtime[2], ctime[2]; /* * Fasttrack empty write @@ -619,13 +622,19 @@ ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + /* * If immutable or not appending then return EPERM */ - pflags = zp->z_phys->zp_flags; - if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || - ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && - (uio->uio_loffset < zp->z_phys->zp_size))) { + if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || + ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && + (uio->uio_loffset < zp->z_size))) { ZFS_EXIT(zfsvfs); return (EPERM); } @@ -635,7 +644,7 @@ /* * Validate file offset */ - woff = ioflag & FAPPEND ? zp->z_phys->zp_size : uio->uio_loffset; + woff = ioflag & FAPPEND ? 
zp->z_size : uio->uio_loffset; if (woff < 0) { ZFS_EXIT(zfsvfs); return (EINVAL); @@ -645,7 +654,7 @@ * Check for mandatory locks before calling zfs_range_lock() * in order to prevent a deadlock with locks set via fcntl(). */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode) && + if (MANDMODE((mode_t)zp->z_mode) && (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { ZFS_EXIT(zfsvfs); return (error); @@ -678,7 +687,7 @@ * the file block size to increase. * Note that zp_size cannot change with this lock held. */ - woff = zp->z_phys->zp_size; + woff = zp->z_size; } uio->uio_loffset = woff; } else { @@ -700,9 +709,9 @@ n = limit - woff; /* Will this write extend the file length? */ - write_eof = (woff + n > zp->z_phys->zp_size); - - end_size = MAX(zp->z_phys->zp_size, woff + n); + write_eof = (woff + n > zp->z_size); + + end_size = MAX(zp->z_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written @@ -713,10 +722,8 @@ abuf = NULL; woff = uio->uio_loffset; again: - if (zfs_usergroup_overquota(zfsvfs, - B_FALSE, zp->z_phys->zp_uid) || - zfs_usergroup_overquota(zfsvfs, - B_TRUE, zp->z_phys->zp_gid)) { + if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || + zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { if (abuf != NULL) dmu_return_arcbuf(abuf); error = EDQUOT; @@ -735,7 +742,7 @@ aiov->iov_len == arc_buf_size(abuf))); i_iov++; } else if (abuf == NULL && n >= max_blksz && - woff >= zp->z_phys->zp_size && + woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* @@ -747,7 +754,8 @@ */ size_t cbytes; - abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz); + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if (error = uiocopy(abuf->b_data, max_blksz, @@ -762,8 +770,9 @@ * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); + zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { if (error == ERESTART) { @@ -825,7 +834,8 @@ xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); - dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx); + dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), + woff, abuf, tx); } ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); @@ -840,6 +850,8 @@ * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; @@ -853,33 +865,35 @@ * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * - * Note: we don't call zfs_fuid_map_id() here because - * user 0 is not an ephemeral uid. 
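The zfs_write() hunks above also change the transaction setup: dmu_tx_hold_bonus() on the object becomes dmu_tx_hold_sa() on the znode's SA handle, and zfs_sa_upgrade_txholds() reserves room in case the znode is lazily upgraded to the new SA layout (see zfs_sa_upgrade() in zfs_sa.c). The hold sequence reduced to a sketch (hypothetical helper; the real callers retry on ERESTART after dmu_tx_wait()):

        static int
        write_tx_assign_sketch(znode_t *zp, uint64_t off, uint64_t len,
            dmu_tx_t **txp)
        {
                dmu_tx_t *tx = dmu_tx_create(zp->z_zfsvfs->z_os);
                int error;

                dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);  /* was dmu_tx_hold_bonus() */
                dmu_tx_hold_write(tx, zp->z_id, off, len);
                zfs_sa_upgrade_txholds(tx, zp);

                error = dmu_tx_assign(tx, TXG_NOWAIT);
                if (error != 0) {
                        dmu_tx_abort(tx);   /* simplified: no ERESTART retry here */
                        return (error);
                }
                *txp = tx;
                return (0);
        }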
*/ mutex_enter(&zp->z_acl_lock); - if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && - (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, - (zp->z_phys->zp_mode & S_ISUID) != 0 && - zp->z_phys->zp_uid == 0) != 0) { - zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); + (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { + uint64_t newmode; + zp->z_mode &= ~(S_ISUID | S_ISGID); + newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); - /* - * Update time stamp. NOTE: This marks the bonus buffer as - * dirty, so we don't have to do it again for zp_size. - */ - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, + B_TRUE); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ - while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) - (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, + while ((end_size = zp->z_size) < uio->uio_loffset) { + (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); + ASSERT(error == 0); + } + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); @@ -983,7 +997,7 @@ if (buf != NULL) { /* immediate write */ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ - if (offset >= zp->z_phys->zp_size) { + if (offset >= zp->z_size) { error = ENOENT; } else { error = dmu_read(os, object, offset, size, buf, @@ -1010,7 +1024,7 @@ zfs_range_unlock(zgd->zgd_rl); } /* test for truncation needs to be done while range locked */ - if (lr->lr_offset >= zp->z_phys->zp_size) + if (lr->lr_offset >= zp->z_size) error = ENOENT; #ifdef DEBUG if (zil_fault_io) { @@ -1132,7 +1146,7 @@ if (dvp->v_type != VDIR) { return (ENOTDIR); - } else if (zdp->z_dbuf == NULL) { + } else if (zdp->z_sa_hdl == NULL) { return (EIO); } @@ -1184,7 +1198,7 @@ * We don't allow recursive attributes.. * Maybe someday we will. */ - if (zdp->z_phys->zp_flags & ZFS_XATTR) { + if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (EINVAL); } @@ -1277,7 +1291,7 @@ ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); - zfs_acl_ids_t acl_ids; + zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; /* @@ -1344,6 +1358,7 @@ return (error); } } + if (zp == NULL) { uint64_t txtype; @@ -1359,7 +1374,8 @@ * We only support the creation of regular files in * extended attribute directories. 
*/ - if ((dzp->z_phys->zp_flags & ZFS_XATTR) && + + if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { error = EINVAL; goto out; @@ -1375,15 +1391,19 @@ } tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); - dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, SPA_MAXBLOCKSIZE); + 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { @@ -1398,13 +1418,12 @@ ZFS_EXIT(zfsvfs); return (error); } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dl, zp, tx, ZNEW); - txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; @@ -1490,6 +1509,9 @@ * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ + +uint64_t null_xattr = 0; + /*ARGSUSED*/ static int zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, @@ -1500,7 +1522,8 @@ vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - uint64_t acl_obj, xattr_obj; + uint64_t acl_obj, xattr_obj = 0; + uint64_t xattr_obj_unlinked = 0; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; @@ -1566,24 +1589,29 @@ */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); if (may_delete_now) { toobig = - zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; + zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; /* if the file is too big, only hold_free a token amount */ dmu_tx_hold_free(tx, zp->z_id, 0, (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); } /* are there any extended attributes? */ - if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { - /* XXX - do we need this if we are deleting? 
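The zfs_remove() changes around this point replace the zp_xattr test with an SA lookup; the hunk just below holds both SA handles so the extended attribute directory's link count can be dropped in the same transaction. Condensed (tx, zfsvfs and error come from the surrounding function):

        uint64_t xattr_obj = 0;
        znode_t *xzp = NULL;

        error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
            &xattr_obj, sizeof (xattr_obj));
        if (xattr_obj != 0) {
                error = zfs_zget(zfsvfs, xattr_obj, &xzp);
                dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);   /* zp may lose its xattr ref */
                dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); /* xattr dir link count */
        }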
*/ - dmu_tx_hold_bonus(tx, xattr_obj); + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT3U(error, ==, 0); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } /* are there any additional acls */ - if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && - may_delete_now) + if ((acl_obj = ZFS_EXTERNAL_ACL(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); /* charge as an update -- would be nice not to charge at all */ @@ -1616,26 +1644,37 @@ } if (unlinked) { + mutex_enter(&vp->v_lock); + + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && vp->v_count == 1 && !vn_has_cached_data(vp) && - zp->z_phys->zp_xattr == xattr_obj && - zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; + xattr_obj == xattr_obj_unlinked && ZFS_EXTERNAL_ACL(zp) == + acl_obj; mutex_exit(&vp->v_lock); } if (delete_now) { - if (zp->z_phys->zp_xattr) { - error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); - ASSERT3U(error, ==, 0); - ASSERT3U(xzp->z_phys->zp_links, ==, 2); - dmu_buf_will_dirty(xzp->z_dbuf, tx); + if (xattr_obj_unlinked) { + ASSERT3U(xzp->z_links, ==, 2); mutex_enter(&xzp->z_lock); xzp->z_unlinked = 1; - xzp->z_phys->zp_links = 0; + xzp->z_links = 0; + error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &xzp->z_links, sizeof (xzp->z_links), tx); + ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); - zp->z_phys->zp_xattr = 0; /* probably unnecessary */ + if (zp->z_is_sa) + error = sa_remove(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), tx); + else + error = sa_update(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), &null_xattr, + sizeof (uint64_t), tx); + ASSERT3U(error, ==, 0); } mutex_enter(&zp->z_lock); mutex_enter(&vp->v_lock); @@ -1707,7 +1746,7 @@ ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); - zfs_acl_ids_t acl_ids; + zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); @@ -1731,7 +1770,7 @@ ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; - if (dzp->z_phys->zp_flags & ZFS_XATTR) { + if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (EINVAL); } @@ -1791,9 +1830,14 @@ fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); - if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, SPA_MAXBLOCKSIZE); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_acl_ids_free(&acl_ids); @@ -1811,10 +1855,11 @@ /* * Create new node. */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); + /* * Now put new name in parent dir. 
*/ @@ -1829,6 +1874,7 @@ acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); zfs_dirent_unlock(dl); @@ -1920,8 +1966,10 @@ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); @@ -2003,6 +2051,7 @@ zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ + uint64_t parent; int local_eof; int outcount; int error; @@ -2012,6 +2061,12 @@ ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + /* * If we are not given an eof variable, * use a local one. @@ -2099,7 +2154,7 @@ } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; - objnum = zp->z_phys->zp_parent; + objnum = parent; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; @@ -2293,24 +2348,32 @@ { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_phys_t *pzp; int error = 0; uint64_t links; + uint64_t mtime[2], ctime[2]; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + sa_bulk_attr_t bulk[2]; + int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - pzp = zp->z_phys; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + + if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ - if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) && - (pzp->zp_uid != crgetuid(cr))) { + if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (zp->z_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { ZFS_EXIT(zfsvfs); @@ -2325,16 +2388,17 @@ mutex_enter(&zp->z_lock); vap->va_type = vp->v_type; - vap->va_mode = pzp->zp_mode & MODEMASK; - zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + vap->va_mode = zp->z_mode & MODEMASK; + vap->va_uid = zp->z_uid; + vap->va_gid = zp->z_gid; vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; vap->va_nodeid = zp->z_id; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) - links = pzp->zp_links + 1; + links = zp->z_links + 1; else - links = pzp->zp_links; + links = zp->z_links; vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! 
*/ - vap->va_size = pzp->zp_size; + vap->va_size = zp->z_size; vap->va_rdev = vp->v_rdev; vap->va_seq = zp->z_seq; @@ -2345,115 +2409,97 @@ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = - ((pzp->zp_flags & ZFS_ARCHIVE) != 0); + ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = - ((pzp->zp_flags & ZFS_READONLY) != 0); + ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = - ((pzp->zp_flags & ZFS_SYSTEM) != 0); + ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = - ((pzp->zp_flags & ZFS_HIDDEN) != 0); + ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = - ((pzp->zp_flags & ZFS_NOUNLINK) != 0); + ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = - ((pzp->zp_flags & ZFS_IMMUTABLE) != 0); + ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = - ((pzp->zp_flags & ZFS_APPENDONLY) != 0); + ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = - ((pzp->zp_flags & ZFS_NODUMP) != 0); + ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = - ((pzp->zp_flags & ZFS_OPAQUE) != 0); + ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = - ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0); + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = - ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0); + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && - vp->v_type == VREG && - (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) { - size_t len; - dmu_object_info_t doi; - - /* - * Only VREG files have anti-virus scanstamps, so we - * won't conflict with symlinks in the bonus buffer. - */ - dmu_object_info_from_db(zp->z_dbuf, &doi); - len = sizeof (xoap->xoa_av_scanstamp) + - sizeof (znode_phys_t); - if (len <= doi.doi_bonus_size) { - /* - * pzp points to the start of the - * znode_phys_t. pzp + 1 points to the - * first byte after the znode_phys_t. 
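The zfs_getattr() conversion above introduces the bulk read idiom used by most of the rewritten vnode operations: attributes are described once with SA_ADD_BULK_ATTR() and fetched with a single sa_bulk_lookup(). A minimal sketch (zp, zfsvfs and vap as in the surrounding function):

        sa_bulk_attr_t bulk[2];
        uint64_t mtime[2], ctime[2];
        int count = 0, error;

        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);

        /* one call fetches every attribute registered in bulk[] */
        if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0)
                return (error);

        ZFS_TIME_DECODE(&vap->va_mtime, mtime);
        ZFS_TIME_DECODE(&vap->va_ctime, ctime);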
- */ - (void) memcpy(xoap->xoa_av_scanstamp, - pzp + 1, - sizeof (xoap->xoa_av_scanstamp)); - XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); - } + vp->v_type == VREG) { + zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { - ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime); + uint64_t times[2]; + + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + times, sizeof (times)); + ZFS_TIME_DECODE(&xoap->xoa_createtime, times); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - xoap->xoa_reparse = - ((pzp->zp_flags & ZFS_REPARSE) != 0); + xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } } - ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); - ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); - ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); + ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); + ZFS_TIME_DECODE(&vap->va_mtime, mtime); + ZFS_TIME_DECODE(&vap->va_ctime, ctime); mutex_exit(&zp->z_lock); - dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks); + sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks); if (zp->z_blksz == 0) { /* @@ -2490,7 +2536,6 @@ caller_context_t *ct) { znode_t *zp = VTOZ(vp); - znode_phys_t *pzp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; @@ -2501,15 +2546,19 @@ int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; + uint64_t xattr_obj = 0; + uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; - int err; + int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - boolean_t fuid_dirtied = B_FALSE; + boolean_t fuid_dirtied = B_FALSE; + sa_bulk_attr_t bulk[7], xattr_bulk[7]; + int count = 0, xattr_count = 0; if (mask == 0) return (0); @@ -2520,7 +2569,6 @@ ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - pzp = zp->z_phys; zilog = zfsvfs->z_log; /* @@ -2557,14 +2605,14 @@ /* * Immutable files can only alter immutable bit and atime */ - if ((pzp->zp_flags & ZFS_IMMUTABLE) && + if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (EPERM); } - if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) { + if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (EPERM); } @@ -2621,9 +2669,10 @@ XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || - XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); + } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); @@ -2636,7 +2685,7 @@ */ if (!(mask & AT_MODE)) - vap->va_mode = pzp->zp_mode; + vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of @@ -2674,8 +2723,9 @@ } mutex_enter(&zp->z_lock); - oldva.va_mode = pzp->zp_mode; - zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + oldva.va_mode = zp->z_mode; + oldva.va_uid = zp->z_uid; + oldva.va_gid = zp->z_gid; if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes @@ -2686,7 +2736,7 @@ */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != - ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) { + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, 
XAT_APPENDONLY); @@ -2696,7 +2746,7 @@ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != - ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) { + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); @@ -2706,7 +2756,7 @@ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != - ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) { + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); @@ -2716,7 +2766,7 @@ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != - ((pzp->zp_flags & ZFS_NODUMP) != 0)) { + ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); @@ -2726,7 +2776,7 @@ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != - ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) { + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); @@ -2738,7 +2788,7 @@ if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != - ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) { + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); @@ -2805,78 +2855,83 @@ */ mask = vap->va_mask; - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - - if (mask & AT_MODE) { - uint64_t pmode = pzp->zp_mode; - - new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - - if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) - goto out; - if (pzp->zp_acl.z_acl_extern_obj) { - /* Are we upgrading ACL from old V0 format to new V1 */ - if (zfsvfs->z_version <= ZPL_VERSION_FUID && - pzp->zp_acl.z_acl_version == - ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, - pzp->zp_acl.z_acl_extern_obj, 0, - DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } else { - dmu_tx_hold_write(tx, - pzp->zp_acl.z_acl_extern_obj, 0, - aclp->z_acl_bytes); - } - } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); - } - } - - if (mask & (AT_UID | AT_GID)) { - if (pzp->zp_xattr) { - err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); + if ((mask & (AT_UID | AT_GID))) { + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, + sizeof (xattr_obj)); + + if (xattr_obj) { + err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err) - goto out; - dmu_tx_hold_bonus(tx, attrzp->z_id); + goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); - if (new_uid != pzp->zp_uid && - zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) { + if (vap->va_uid != zp->z_uid && + zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { err = EDQUOT; - goto out; + goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); - if (new_gid != pzp->zp_gid && - zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) { + if (new_gid != zp->z_gid && + zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { err = EDQUOT; - goto out; - } - } - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, - FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); + goto out2; } } } + tx = 
dmu_tx_create(zfsvfs->z_os); + + if (mask & AT_MODE) { + uint64_t pmode = zp->z_mode; + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) + goto out; + + if (!zp->z_is_sa && ZFS_EXTERNAL_ACL(zp)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? + */ + if (zfsvfs->z_version <= ZPL_VERSION_FUID && + ZNODE_ACL_VERSION(zp) == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, + ZFS_EXTERNAL_ACL(zp), 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, ZFS_EXTERNAL_ACL(zp), 0, + aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if ((mask & AT_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + } + + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); + } + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err) { @@ -2885,8 +2940,7 @@ goto out; } - dmu_buf_will_dirty(zp->z_dbuf, tx); - + count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. @@ -2897,9 +2951,38 @@ mutex_enter(&zp->z_lock); + if (attrzp) + mutex_enter(&attrzp->z_lock); + + if (mask & AT_UID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &new_uid, sizeof (new_uid)); + zp->z_uid = zfs_fuid_map_id(zfsvfs, new_uid, cr, ZFS_OWNER); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_UID(zfsvfs), NULL, &new_uid, + sizeof (new_uid)); + attrzp->z_gid = zp->z_uid; + } + } + + if (mask & AT_GID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &new_gid, sizeof (new_gid)); + zp->z_gid = zfs_fuid_map_id(zfsvfs, new_gid, cr, ZFS_GROUP); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_GID(zfsvfs), NULL, &new_gid, + sizeof (new_gid)); + attrzp->z_gid = zp->z_gid; + } + } + if (mask & AT_MODE) { mutex_enter(&zp->z_acl_lock); - zp->z_phys->zp_mode = new_mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &new_mode, sizeof (new_mode)); + zp->z_mode = new_mode; err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT3U(err, ==, 0); zp->z_acl_cached = aclp; @@ -2908,34 +2991,42 @@ } if (attrzp) - mutex_enter(&attrzp->z_lock); - - if (mask & AT_UID) { - pzp->zp_uid = new_uid; - if (attrzp) - attrzp->z_phys->zp_uid = new_uid; + mutex_exit(&attrzp->z_lock); + + if (mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); } - if (mask & AT_GID) { - pzp->zp_gid = new_gid; - if (attrzp) - attrzp->z_phys->zp_gid = new_gid; + if (mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); } - if (attrzp) - mutex_exit(&attrzp->z_lock); - - if (mask & AT_ATIME) - ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); - - if (mask & AT_MTIME) - ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); - /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? 
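On the write side, zfs_setattr() now accumulates every changed attribute into bulk[] (and xattr_bulk[] for an attribute directory) and flushes them with one sa_bulk_update() under the assigned transaction, visible just below after the out: label. The idiom in miniature (zp, zfsvfs and tx from the surrounding function; the mode value is only a placeholder):

        sa_bulk_attr_t bulk[2];
        uint64_t new_mode = 0644;           /* placeholder value */
        uint64_t mtime[2], ctime[2];
        int count = 0, err;

        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
            &new_mode, sizeof (new_mode));
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
            &ctime, sizeof (ctime));
        zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE);

        /* all accumulated attributes hit the SA layer in one call */
        err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);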
*/ - if (mask & AT_SIZE) - zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); - else if (mask != 0) - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + if (mask & AT_SIZE && !(mask & AT_MTIME)) { + if (!(mask & AT_MTIME)) + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, + B_TRUE); + } else if (mask != 0) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, + B_TRUE); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(attrzp, STATE_CHANGED, + mtime, ctime, B_TRUE); + } + } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit @@ -2967,20 +3058,12 @@ XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { - size_t len; - dmu_object_info_t doi; - + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); - /* Grow the bonus buffer if necessary. */ - dmu_object_info_from_db(zp->z_dbuf, &doi); - len = sizeof (xoap->xoa_av_scanstamp) + - sizeof (znode_phys_t); - if (len > doi.doi_bonus_size) - VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0); - } - zfs_xvattr_set(zp, xvap); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) @@ -2992,9 +3075,14 @@ mutex_exit(&zp->z_lock); out: + if (err == 0 && attrzp) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + if (attrzp) VN_RELE(ZTOV(attrzp)); - if (aclp) zfs_acl_free(aclp); @@ -3003,14 +3091,17 @@ fuidp = NULL; } - if (err) + if (err) { dmu_tx_abort(tx); - else + if (err == ERESTART) + goto top; + } else { + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); - - if (err == ERESTART) - goto top; - + } + + +out2: ZFS_EXIT(zfsvfs); return (err); } @@ -3050,7 +3141,7 @@ zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = zp->z_zfsvfs->z_root; - uint64_t *oidp = &zp->z_id; + uint64_t oidp = zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; @@ -3072,7 +3163,7 @@ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; - oidp = &zp->z_id; + oidp = zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; @@ -3090,19 +3181,20 @@ zl->zl_next = *zlpp; *zlpp = zl; - if (*oidp == szp->z_id) /* We're a descendant of szp */ + if (oidp == szp->z_id) /* We're a descendant of szp */ return (EINVAL); - if (*oidp == rootid) /* We've hit the top */ + if (oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ - int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp); + int error = zfs_zget(zp->z_zfsvfs, oidp, &zp); if (error) return (error); zl->zl_znode = zp; } - oidp = &zp->z_phys->zp_parent; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), + &oidp, sizeof (oidp)); rwlp = &zp->z_parent_lock; rw = RW_READER; @@ -3182,8 +3274,7 @@ * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. 
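In the zfs_rename_lock() hunk above, the old pointer chase through zp->z_phys->zp_parent becomes a per-step SA lookup; one step of that ".." walk looks like this (the full function zfs_zget()s each parent and records it on a zfs_zlock_t list until it reaches zfsvfs->z_root or detects a cycle):

        uint64_t parent;

        (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
            &parent, sizeof (parent));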
*/ - if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != - (sdzp->z_phys->zp_flags & ZFS_XATTR)) { + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (EINVAL); } @@ -3363,14 +3454,20 @@ } tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ - dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); - if (sdzp != tdzp) - dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ - if (tzp) - dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ + if (sdzp != tdzp) { + dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tdzp); + } + if (tzp) { + dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tzp); + } + + zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { @@ -3401,10 +3498,14 @@ if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error == 0) { - szp->z_phys->zp_flags |= ZFS_AV_MODIFIED; + szp->z_pflags |= ZFS_AV_MODIFIED; + + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + ASSERT3U(error, ==, 0); error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); - ASSERT(error == 0); + ASSERT3U(error, ==, 0); zfs_log_rename(zilog, tx, TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), @@ -3462,11 +3563,12 @@ dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - int len = strlen(link); + uint64_t len = strlen(link); int error; int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; + uint64_t txtype = TX_SYMLINK; ASSERT(vap->va_type == VLNK); @@ -3511,10 +3613,14 @@ tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); - dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE + len); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_NOWAIT); @@ -3531,50 +3637,33 @@ return (error); } - dmu_buf_will_dirty(dzp->z_dbuf, tx); - /* * Create a new object for the symlink. - * Put the link content into bonus buffer if it will fit; - * otherwise, store it just like any other file data. + * for version 4 ZPL datsets the symlink will be an SA attribute */ - if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { - zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids); - if (len != 0) - bcopy(link, zp->z_phys + 1, len); - } else { - dmu_buf_t *dbp; - - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - /* - * Nothing can access the znode yet so no locking needed - * for growing the znode's blocksize. 
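The zfs_symlink() hunks just below drop the old store-in-bonus-buffer logic: on an SA-capable dataset the link target becomes the ZPL_SYMLINK attribute, otherwise zfs_sa_symlink() keeps the legacy bonus/file layout, and zfs_readlink() (further down) mirrors the same split. The two halves side by side, condensed from those hunks (variables come from the surrounding functions):

        /* create: store the target */
        if (zp->z_is_sa)
                error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
                    link, len, tx);
        else
                zfs_sa_symlink(zp, link, len, tx);  /* legacy layout */

        zp->z_size = len;
        (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
            &zp->z_size, sizeof (zp->z_size), tx);

        /* readlink: fetch it back */
        if (zp->z_is_sa)
                error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio);
        else
                error = zfs_sa_readlink(zp, uio);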
- */ - zfs_grow_blocksize(zp, len, tx); - - VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, - zp->z_id, 0, FTAG, &dbp)); - dmu_buf_will_dirty(dbp, tx); - - ASSERT3U(len, <=, dbp->db_size); - bcopy(link, dbp->db_data, len); - dmu_buf_rele(dbp, FTAG); - } - zp->z_phys->zp_size = len; - + + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (zp->z_is_sa) + error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), + link, len, tx); + else + zfs_sa_symlink(zp, link, len, tx); + + zp->z_size = len; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ (void) zfs_link_create(dl, zp, tx, ZNEW); - if (error == 0) { - uint64_t txtype = TX_SYMLINK; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - } + + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); zfs_acl_ids_free(&acl_ids); @@ -3611,29 +3700,19 @@ { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - size_t bufsz; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - bufsz = (size_t)zp->z_phys->zp_size; - if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { - error = uiomove(zp->z_phys + 1, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - } else { - dmu_buf_t *dbp; - error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - error = uiomove(dbp->db_data, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - dmu_buf_rele(dbp, FTAG); - } + if (zp->z_is_sa) + error = sa_lookup_uio(zp->z_sa_hdl, + SA_ZPL_SYMLINK(zfsvfs), uio); + else + error = zfs_sa_readlink(zp, uio); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); return (error); } @@ -3668,7 +3747,6 @@ vnode_t *realvp; int error; int zf = ZNEW; - uid_t owner; ASSERT(tdvp->v_type == VDIR); @@ -3701,8 +3779,7 @@ * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ - if ((szp->z_phys->zp_flags & ZFS_XATTR) != - (dzp->z_phys->zp_flags & ZFS_XATTR)) { + if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (EINVAL); } @@ -3716,8 +3793,7 @@ return (EPERM); } - owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER); - if (owner != crgetuid(cr) && + if (szp->z_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { ZFS_EXIT(zfsvfs); return (EPERM); @@ -3738,8 +3814,10 @@ } tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, szp->z_id); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + zfs_sa_upgrade_txholds(tx, szp); + zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); @@ -3815,10 +3893,8 @@ dmu_tx_t *tx; u_offset_t off, koff; size_t len, klen; - uint64_t filesz; int err; - filesz = zp->z_phys->zp_size; off = pp->p_offset; len = PAGESIZE; /* @@ -3826,12 +3902,12 @@ * multiple pages so that we write a full block (thus avoiding * a read-modify-write). */ - if (off < filesz && zp->z_blksz > PAGESIZE) { + if (off < zp->z_size && zp->z_blksz > PAGESIZE) { klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); koff = ISP2(klen) ? 
P2ALIGN(off, (u_offset_t)klen) : 0; - ASSERT(koff <= filesz); - if (koff + klen > filesz) - klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE); + ASSERT(koff <= zp->z_size); + if (koff + klen > zp->z_size) + klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE); pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); } ASSERT3U(btop(len), ==, btopr(len)); @@ -3839,30 +3915,32 @@ /* * Can't push pages past end-of-file. */ - if (off >= filesz) { + if (off >= zp->z_size) { /* ignore all pages */ err = 0; goto out; - } else if (off + len > filesz) { - int npages = btopr(filesz - off); + } else if (off + len > zp->z_size) { + int npages = btopr(zp->z_size - off); page_t *trunc; page_list_break(&pp, &trunc, npages); /* ignore pages past end of file */ if (trunc) pvn_write_done(trunc, flags); - len = filesz - off; + len = zp->z_size - off; } - if (zfs_usergroup_overquota(zfsvfs, B_FALSE, zp->z_phys->zp_uid) || - zfs_usergroup_overquota(zfsvfs, B_TRUE, zp->z_phys->zp_gid)) { + if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || + zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { err = EDQUOT; goto out; } top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); - dmu_tx_hold_bonus(tx, zp->z_id); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { if (err == ERESTART) { @@ -3884,7 +3962,16 @@ } if (err == 0) { - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[2]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, + B_TRUE); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); } dmu_tx_commit(tx); @@ -3960,14 +4047,14 @@ } rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); - if (off > zp->z_phys->zp_size) { + if (off > zp->z_size) { /* past end of file */ zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (0); } - len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off); + len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off); for (off = io_off; io_off < off + len; io_off += io_len) { if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { @@ -4008,7 +4095,7 @@ int error; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); - if (zp->z_dbuf == NULL) { + if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. @@ -4041,13 +4128,15 @@ if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { - dmu_buf_will_dirty(zp->z_dbuf, tx); mutex_enter(&zp->z_lock); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); @@ -4099,7 +4188,7 @@ * return an error, but we don't worry about races between this * function and zfs_map(). 
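The zfs_inactive() hunk above is a compact example of the whole life cycle for a deferred SA update: hold the handle, reserve for a possible layout upgrade, assign, write the cached atime back through sa_update(), and commit. Condensed (the z_lock protection around z_atime_dirty is elided):

        if (zp->z_atime_dirty && zp->z_unlinked == 0) {
                dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

                dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
                zfs_sa_upgrade_txholds(tx, zp);
                if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
                        dmu_tx_abort(tx);
                } else {
                        (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
                            (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
                        zp->z_atime_dirty = 0;
                        dmu_tx_commit(tx);
                }
        }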
*/ - if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) { + if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) { ZFS_EXIT(zfsvfs); return (EAGAIN); } @@ -4312,15 +4401,14 @@ ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - if ((prot & PROT_WRITE) && - (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_READONLY | - ZFS_APPENDONLY))) { + if ((prot & PROT_WRITE) && (zp->z_pflags & + (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { ZFS_EXIT(zfsvfs); return (EPERM); } if ((prot & (PROT_READ | PROT_EXEC)) && - (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED)) { + (zp->z_pflags & ZFS_AV_QUARANTINED)) { ZFS_EXIT(zfsvfs); return (EACCES); } @@ -4343,7 +4431,7 @@ /* * If file is locked, disallow mapping. */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) { + if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) { ZFS_EXIT(zfsvfs); return (EAGAIN); } @@ -4489,13 +4577,19 @@ znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen; + uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; - int size, i; + int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - gen = (uint32_t)zp->z_gen; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), + &gen64, sizeof (uint64_t))) != 0) + return (error); + + gen = (uint32_t)gen64; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; if (fidp->fid_len < size) { @@ -4713,21 +4807,24 @@ */ if (preamble) { /* data begins in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz); ASSERT(abuf); (void) dmu_xuio_add(xuio, abuf, blksz - preamble, preamble); } for (i = 0; i < fullblk; i++) { - abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz); ASSERT(abuf); (void) dmu_xuio_add(xuio, abuf, 0, blksz); } if (postamble) { /* data ends in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz); ASSERT(abuf); (void) dmu_xuio_add(xuio, abuf, 0, postamble); } @@ -4749,7 +4846,7 @@ return (EINVAL); } - maxsize = zp->z_phys->zp_size - uio->uio_loffset; + maxsize = zp->z_size - uio->uio_loffset; if (size > maxsize) size = maxsize; diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/fs/zfs/zfs_znode.c --- a/usr/src/uts/common/fs/zfs/zfs_znode.c Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c Tue Mar 16 09:43:38 2010 -0600 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #endif /* _KERNEL */ @@ -61,8 +62,11 @@ #include #include #include +#include +#include #include "zfs_prop.h" +#include "zfs_comutil.h" /* * Define ZNODE_STATS to turn on statistic gathering. 
By default, it is only @@ -131,7 +135,6 @@ avl_create(&zp->z_range_avl, zfs_range_compare, sizeof (rl_t), offsetof(rl_t, r_node)); - zp->z_dbuf = NULL; zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; return (0); @@ -154,7 +157,6 @@ avl_destroy(&zp->z_range_avl); mutex_destroy(&zp->z_range_lock); - ASSERT(zp->z_dbuf == NULL); ASSERT(zp->z_dirlocks == NULL); ASSERT(zp->z_acl_cached == NULL); } @@ -198,8 +200,15 @@ nzp->z_last_itx = ozp->z_last_itx; nzp->z_gen = ozp->z_gen; nzp->z_sync_cnt = ozp->z_sync_cnt; - nzp->z_phys = ozp->z_phys; - nzp->z_dbuf = ozp->z_dbuf; + nzp->z_is_sa = ozp->z_is_sa; + nzp->z_sa_hdl = ozp->z_sa_hdl; + bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2); + nzp->z_links = ozp->z_links; + nzp->z_size = ozp->z_size; + nzp->z_pflags = ozp->z_pflags; + nzp->z_uid = ozp->z_uid; + nzp->z_gid = ozp->z_gid; + nzp->z_mode = ozp->z_mode; /* * Since this is just an idle znode and kmem is already dealing with @@ -210,9 +219,7 @@ ozp->z_acl_cached = NULL; } - /* Update back pointers. */ - (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, - znode_evict_error); + sa_set_userp(nzp->z_sa_hdl, nzp); /* * Invalidate the original znode by clearing fields that provide a @@ -220,7 +227,7 @@ * ensure that zfs_znode_move() recognizes the znode as invalid in any * subsequent callback. */ - ozp->z_dbuf = NULL; + ozp->z_sa_hdl = NULL; POINTER_INVALIDATE(&ozp->z_zfsvfs); } @@ -475,6 +482,7 @@ sharezp->z_unlinked = 0; sharezp->z_atime_dirty = 0; sharezp->z_zfsvfs = zfsvfs; + sharezp->z_is_sa = zfsvfs->z_use_sa; vp = ZTOV(sharezp); vn_reinit(vp); @@ -482,8 +490,7 @@ VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, kcred, NULL, &acl_ids)); - zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, - &zp, 0, &acl_ids); + zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, sharezp); ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */ POINTER_INVALIDATE(&sharezp->z_zfsvfs); @@ -493,8 +500,7 @@ zfs_acl_ids_free(&acl_ids); ZTOV(sharezp)->v_count = 0; - dmu_buf_rele(sharezp->z_dbuf, NULL); - sharezp->z_dbuf = NULL; + sa_handle_destroy(sharezp->z_sa_hdl); kmem_cache_free(znode_cache, sharezp); return (error); @@ -558,26 +564,25 @@ } static void -zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) +zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, + dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) { - znode_t *nzp; - ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); mutex_enter(&zp->z_lock); - ASSERT(zp->z_dbuf == NULL); + ASSERT(zp->z_sa_hdl == NULL); ASSERT(zp->z_acl_cached == NULL); - zp->z_dbuf = db; - nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error); + if (sa_hdl == NULL) { + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, + SA_HDL_SHARED, &zp->z_sa_hdl)); + } else { + zp->z_sa_hdl = sa_hdl; + sa_set_userp(sa_hdl, zp); + } - /* - * there should be no - * concurrent zgets on this object. - */ - if (nzp != NULL) - panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db); + zp->z_is_sa = (obj_type == DMU_OT_SA) ? 
B_TRUE : B_FALSE; /* * Slap on VROOT if we are the root znode @@ -592,14 +597,12 @@ void zfs_znode_dmu_fini(znode_t *zp) { - dmu_buf_t *db = zp->z_dbuf; ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || zp->z_unlinked || RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); - ASSERT(zp->z_dbuf != NULL); - zp->z_dbuf = NULL; - VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL)); - dmu_buf_rele(db, NULL); + + sa_handle_destroy(zp->z_sa_hdl); + zp->z_sa_hdl = NULL; } /* @@ -610,22 +613,27 @@ * return the znode */ static znode_t * -zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, + dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; vnode_t *vp; + uint64_t mode; + uint64_t parent; + uint64_t uid, gid; + sa_bulk_attr_t bulk[9]; + int count = 0; zp = kmem_cache_alloc(znode_cache, KM_SLEEP); ASSERT(zp->z_dirlocks == NULL); - ASSERT(zp->z_dbuf == NULL); ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); /* * Defer setting z_zfsvfs until the znode is ready to be a candidate for * the zfs_znode_move() callback. */ - zp->z_phys = NULL; + zp->z_sa_hdl = NULL; zp->z_unlinked = 0; zp->z_atime_dirty = 0; zp->z_mapcnt = 0; @@ -638,16 +646,41 @@ vp = ZTOV(zp); vn_reinit(vp); - zfs_znode_dmu_init(zfsvfs, zp, db); + zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); - zp->z_gen = zp->z_phys->zp_gen; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &gid, 8); + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) { + if (hdl == NULL) + sa_handle_destroy(zp->z_sa_hdl); + kmem_cache_free(znode_cache, zp); + return (NULL); + } + + zp->z_uid = zfs_fuid_map_id(zfsvfs, uid, CRED(), ZFS_OWNER); + zp->z_gid = zfs_fuid_map_id(zfsvfs, gid, CRED(), ZFS_GROUP); + zp->z_mode = mode; vp->v_vfsp = zfsvfs->z_parent->z_vfs; - vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); + + vp->v_type = IFTOVT((mode_t)mode); switch (vp->v_type) { case VDIR: - if (zp->z_phys->zp_flags & ZFS_XATTR) { + if (zp->z_pflags & ZFS_XATTR) { vn_setops(vp, zfs_xdvnodeops); vp->v_flag |= V_XATTRDIR; } else { @@ -657,7 +690,13 @@ break; case VBLK: case VCHR: - vp->v_rdev = zfs_cmpldev(zp->z_phys->zp_rdev); + { + uint64_t rdev; + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), + &rdev, sizeof (rdev)) == 0); + + vp->v_rdev = zfs_cmpldev(rdev); + } /*FALLTHROUGH*/ case VFIFO: case VSOCK: @@ -666,10 +705,12 @@ break; case VREG: vp->v_flag |= VMODSORT; - if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir) + if (parent == zfsvfs->z_shares_dir) { + ASSERT(uid == 0 && gid == 0); vn_setops(vp, zfs_sharevnodeops); - else + } else { vn_setops(vp, zfs_fvnodeops); + } break; case VLNK: vn_setops(vp, zfs_symvnodeops); @@ -693,6 +734,9 @@ return (zp); } +static uint64_t empty_xattr; +static uint64_t pad[4]; +static zfs_acl_phys_t acl_phys; /* * Create a new DMU object to hold a zfs znode. 
* @@ -712,14 +756,23 @@ */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids) + uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) { + uint64_t crtime[2], atime[2], mtime[2], ctime[2]; + uint64_t mode, size, links, parent, pflags; + uint64_t dzp_pflags = 0; + uint64_t rdev = 0; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; dmu_buf_t *db; - znode_phys_t *pzp; - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; timestruc_t now; uint64_t gen, obj; int err; + int bonuslen; + sa_handle_t *sa_hdl; + dmu_object_type_t obj_type; + sa_bulk_attr_t sa_attrs[ZPL_END]; + int cnt = 0; + zfs_acl_locator_cb_t locate = { 0 }; ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); @@ -733,6 +786,10 @@ gen = dmu_tx_get_txg(tx); } + obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; + bonuslen = (obj_type == DMU_OT_SA) ? + DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE; + /* * Create a new DMU object. */ @@ -746,106 +803,211 @@ if (zfsvfs->z_replay) { err = zap_create_claim_norm(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, - DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + obj_type, bonuslen, tx); ASSERT3U(err, ==, 0); } else { obj = zap_create_norm(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, - DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + obj_type, bonuslen, tx); } } else { if (zfsvfs->z_replay) { err = dmu_object_claim(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, - DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + obj_type, bonuslen, tx); ASSERT3U(err, ==, 0); } else { obj = dmu_object_alloc(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, - DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); + obj_type, bonuslen, tx); } } ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); - VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); - dmu_buf_will_dirty(db, tx); - - /* - * Initialize the znode physical data to zero. - */ - ASSERT(db->db_size >= sizeof (znode_phys_t)); - bzero(db->db_data, db->db_size); - pzp = db->db_data; + VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { - dzp->z_dbuf = db; - dzp->z_phys = pzp; dzp->z_id = obj; + } else { + dzp_pflags = dzp->z_pflags; } /* * If parent is an xattr, so am I. */ - if (dzp->z_phys->zp_flags & ZFS_XATTR) + if (dzp_pflags & ZFS_XATTR) { flag |= IS_XATTR; - - if (vap->va_type == VBLK || vap->va_type == VCHR) { - pzp->zp_rdev = zfs_expldev(vap->va_rdev); } if (zfsvfs->z_use_fuids) - pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + else + pflags = 0; if (vap->va_type == VDIR) { - pzp->zp_size = 2; /* contents ("." and "..") */ - pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; + size = 2; /* contents ("." and "..") */ + links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; + } else { + size = links = 0; + } + + if (vap->va_type == VBLK || vap->va_type == VCHR) { + rdev = zfs_expldev(vap->va_rdev); } - pzp->zp_parent = dzp->z_id; + parent = dzp->z_id; + mode = acl_ids->z_mode; if (flag & IS_XATTR) - pzp->zp_flags |= ZFS_XATTR; + pflags |= ZFS_XATTR; - pzp->zp_gen = gen; + /* + * No execs denied will be deterimed when zfs_mode_compute() is called. 
+ */ + pflags |= acl_ids->z_aclp->z_hints & + (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| + ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); - ZFS_TIME_ENCODE(&now, pzp->zp_crtime); - ZFS_TIME_ENCODE(&now, pzp->zp_ctime); + ZFS_TIME_ENCODE(&now, crtime); + ZFS_TIME_ENCODE(&now, ctime); if (vap->va_mask & AT_ATIME) { - ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); + ZFS_TIME_ENCODE(&vap->va_atime, atime); } else { - ZFS_TIME_ENCODE(&now, pzp->zp_atime); + ZFS_TIME_ENCODE(&now, atime); } if (vap->va_mask & AT_MTIME) { - ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); } else { - ZFS_TIME_ENCODE(&now, pzp->zp_mtime); + ZFS_TIME_ENCODE(&now, mtime); } - pzp->zp_uid = acl_ids->z_fuid; - pzp->zp_gid = acl_ids->z_fgid; - pzp->zp_mode = acl_ids->z_mode; + + /* Now add in all of the "SA" attributes */ + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, + &sa_hdl)); + + /* + * Setup the array of attributes to be replaced/set on the new file + * + * order for DMU_OT_ZNODE is critical since it needs to be constructed + * in the old znode_phys_t format. Don't change this ordering + */ + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + } else { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, + &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, + &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + } + + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, + &empty_xattr, 8); + } + if (obj_type == DMU_OT_ZNODE || + (vap->va_type == VBLK || vap->va_type == VCHR)) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), + NULL, &rdev, 8); + + } + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, + &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, + &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, + sizeof (uint64_t) * 4); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (zfs_acl_phys_t)); + } else if (acl_ids->z_aclp->z_version >= 
ZFS_ACL_VERSION_FUID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &acl_ids->z_aclp->z_acl_count, 8); + locate.cb_aclp = acl_ids->z_aclp; + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, + acl_ids->z_aclp->z_acl_bytes); + mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags); + } + + VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); + if (!(flag & IS_ROOT_NODE)) { - *zpp = zfs_znode_alloc(zfsvfs, db, 0); + *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); + ASSERT(*zpp != NULL); } else { /* * If we are creating the root node, the "parent" we * passed in is the znode for the root. */ *zpp = dzp; + + (*zpp)->z_sa_hdl = sa_hdl; } - VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); + if (vap->va_mask & AT_XVATTR) - zfs_xvattr_set(*zpp, (xvattr_t *)vap); + zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); + + (*zpp)->z_pflags = pflags; + (*zpp)->z_mode = mode; + if (obj_type == DMU_OT_ZNODE || + acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { + err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx); + ASSERT3P(err, ==, 0); + } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } +/* + * zfs_xvattr_set only updates the in-core attributes + * it is assumed the caller will be doing an sa_bulk_update + * to push the changes out + */ void -zfs_xvattr_set(znode_t *zp, xvattr_t *xvap) +zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; @@ -853,62 +1015,74 @@ ASSERT(xoap); if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { - ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime); + uint64_t times[2]; + ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + ×, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { - ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly); + ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, + zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { - ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden); + ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, + zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { - ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system); + ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, + zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { - ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive); + ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, + zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { - ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable); + ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, + zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { - ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink); + ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, + zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { - ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly); + ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, + zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { - ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump); + ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, + zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { - ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque); + ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, + 
zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, - xoap->xoa_av_quarantined); + xoap->xoa_av_quarantined, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { - ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified); + ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, + zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { - (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp, - sizeof (xoap->xoa_av_scanstamp)); - zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP; + zfs_sa_set_scanstamp(zp, xvap, tx); XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse); + ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, + zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } } @@ -920,35 +1094,42 @@ dmu_buf_t *db; znode_t *zp; int err; + sa_handle_t *hdl; *zpp = NULL; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); - err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (err); } dmu_object_info_from_db(db, &doi); - if (doi.doi_bonus_type != DMU_OT_ZNODE || - doi.doi_bonus_size < sizeof (znode_phys_t)) { - dmu_buf_rele(db, NULL); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (EINVAL); } - zp = dmu_buf_get_user(db); - if (zp != NULL) { - mutex_enter(&zp->z_lock); + hdl = dmu_buf_get_user(db); + if (hdl != NULL) { + zp = sa_get_userdata(hdl); + /* - * Since we do immediate eviction of the z_dbuf, we - * should never find a dbuf with a znode that doesn't - * know about the dbuf. + * Since "SA" does immediate eviction we + * should never find a sa handle that doesn't + * know about the znode. */ - ASSERT3P(zp->z_dbuf, ==, db); + + ASSERT3P(zp, !=, NULL); + + mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_unlinked) { err = ENOENT; @@ -957,7 +1138,7 @@ *zpp = zp; err = 0; } - dmu_buf_rele(db, NULL); + sa_buf_rele(db, NULL); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (err); @@ -969,18 +1150,17 @@ * * There is a small window where zfs_vget() could * find this object while a file create is still in - * progress. Since a gen number can never be zero - * we will check that to determine if its an allocated - * file. + * progress. This is checked for in zfs_znode_alloc() + * + * if zfs_znode_alloc() fails it will drop the hold on the + * bonus buffer. 
*/ - - if (((znode_phys_t *)db->db_data)->zp_gen != 0) { - zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, + doi.doi_bonus_type, NULL); + if (zp == NULL) { + err = ENOENT; + } else { *zpp = zp; - err = 0; - } else { - dmu_buf_rele(db, NULL); - err = ENOENT; } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (err); @@ -993,39 +1173,76 @@ dmu_object_info_t doi; dmu_buf_t *db; uint64_t obj_num = zp->z_id; + uint64_t mode; + uint64_t uid, gid; + sa_bulk_attr_t bulk[8]; int err; + int count = 0; + uint64_t gen; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); - err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); + mutex_enter(&zp->z_acl_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + mutex_exit(&zp->z_acl_lock); + ASSERT(zp->z_sa_hdl == NULL); + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (err); } dmu_object_info_from_db(db, &doi); - if (doi.doi_bonus_type != DMU_OT_ZNODE || - doi.doi_bonus_size < sizeof (znode_phys_t)) { - dmu_buf_rele(db, NULL); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (EINVAL); } - if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) { - dmu_buf_rele(db, NULL); + zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); + + /* reload cached values */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, + &gen, sizeof (gen)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, sizeof (zp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &uid, sizeof (uid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &gid, sizeof (gid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + + zp->z_mode = mode; + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { + zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (EIO); } - mutex_enter(&zp->z_acl_lock); - if (zp->z_acl_cached) { - zfs_acl_free(zp->z_acl_cached); - zp->z_acl_cached = NULL; + if (gen != zp->z_gen) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (EIO); } - mutex_exit(&zp->z_acl_lock); - zfs_znode_dmu_init(zfsvfs, zp, db); - zp->z_unlinked = (zp->z_phys->zp_links == 0); + zp->z_uid = zfs_fuid_map_id(zfsvfs, uid, CRED(), ZFS_OWNER); + zp->z_gid = zfs_fuid_map_id(zfsvfs, gid, CRED(), ZFS_GROUP); + zp->z_unlinked = (zp->z_links == 0); zp->z_blksz = doi.doi_data_block_size; ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); @@ -1039,7 +1256,7 @@ zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; - uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; + uint64_t acl_obj = ZFS_EXTERNAL_ACL(zp); ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); if (acl_obj) @@ -1057,7 +1274,7 @@ zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t z_id = zp->z_id; - ASSERT(zp->z_dbuf && zp->z_phys); + ASSERT(zp->z_sa_hdl); /* * Don't allow a zfs_zget() while were trying to release this znode @@ -1096,6 +1313,7 @@ zfs_rmnode(zp); return; } + 
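/*
 * Illustrative sketch (hypothetical helper, not part of this changeset):
 * the zfs_znode_alloc()/zfs_rezget() changes above replace direct
 * znode_phys_t dereferences with the SA bulk-lookup idiom: build an
 * array of sa_bulk_attr_t entries with SA_ADD_BULK_ATTR() and resolve
 * them in a single sa_bulk_lookup() call against the znode's SA handle.
 * Assuming the sa.h/zfs_sa.h definitions introduced by this changeset,
 * a caller that only needs a couple of cached attributes would look
 * roughly like this:
 */
static int
example_lookup_size_and_links(zfsvfs_t *zfsvfs, znode_t *zp,
    uint64_t *sizep, uint64_t *linksp)
{
	sa_bulk_attr_t bulk[2];
	int count = 0;

	/* Register each attribute with the buffer that receives it. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    sizep, sizeof (*sizep));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    linksp, sizeof (*linksp));

	/* One call fetches every registered attribute from the handle. */
	return (sa_bulk_lookup(zp->z_sa_hdl, bulk, count));
}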
mutex_exit(&zp->z_lock); zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); @@ -1127,59 +1345,40 @@ } void -zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) +zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2], boolean_t have_tx) { timestruc_t now; - ASSERT(MUTEX_HELD(&zp->z_lock)); - gethrestime(&now); - if (tx) { - dmu_buf_will_dirty(zp->z_dbuf, tx); + if (have_tx) { /* will sa_bulk_update happen really soon? */ zp->z_atime_dirty = 0; zp->z_seq++; } else { zp->z_atime_dirty = 1; } - if (flag & AT_ATIME) - ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); + if (flag & AT_ATIME) { + ZFS_TIME_ENCODE(&now, zp->z_atime); + } if (flag & AT_MTIME) { - ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); - if (zp->z_zfsvfs->z_use_fuids) - zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); + ZFS_TIME_ENCODE(&now, mtime); + if (zp->z_zfsvfs->z_use_fuids) { + zp->z_pflags |= (ZFS_ARCHIVE | + ZFS_AV_MODIFIED); + } } if (flag & AT_CTIME) { - ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); + ZFS_TIME_ENCODE(&now, ctime); if (zp->z_zfsvfs->z_use_fuids) - zp->z_phys->zp_flags |= ZFS_ARCHIVE; + zp->z_pflags |= ZFS_ARCHIVE; } } /* - * Update the requested znode timestamps with the current time. - * If we are in a transaction, then go ahead and mark the znode - * dirty in the transaction so the timestamps will go to disk. - * Otherwise, we will get pushed next time the znode is updated - * in a transaction, or when this znode eventually goes inactive. - * - * Why is this OK? - * 1 - Only the ACCESS time is ever updated outside of a transaction. - * 2 - Multiple consecutive updates will be collapsed into a single - * znode update by the transaction grouping semantics of the DMU. - */ -void -zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) -{ - mutex_enter(&zp->z_lock); - zfs_time_stamper_locked(zp, flag, tx); - mutex_exit(&zp->z_lock); -} - -/* * Grow the block size for a file. * * IN: zp - znode of file to free data in. @@ -1201,17 +1400,18 @@ * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ - if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) + if (zp->z_blksz && zp->z_size > zp->z_blksz) return; error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, size, 0, tx); + if (error == ENOTSUP) return; ASSERT3U(error, ==, 0); /* What blocksize did we actually get? */ - dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); + dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); } /* @@ -1254,13 +1454,14 @@ /* * Nothing to do if file already at desired length. */ - if (end <= zp->z_phys->zp_size) { + if (end <= zp->z_size) { zfs_range_unlock(rl); return (0); } top: tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* @@ -1288,12 +1489,14 @@ zfs_range_unlock(rl); return (error); } - dmu_buf_will_dirty(zp->z_dbuf, tx); if (newblksz) zfs_grow_blocksize(zp, newblksz, tx); - zp->z_phys->zp_size = end; + zp->z_size = end; + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx)); zfs_range_unlock(rl); @@ -1327,13 +1530,13 @@ /* * Nothing to do if file already at desired length. 
*/ - if (off >= zp->z_phys->zp_size) { + if (off >= zp->z_size) { zfs_range_unlock(rl); return (0); } - if (off + len > zp->z_phys->zp_size) - len = zp->z_phys->zp_size - off; + if (off + len > zp->z_size) + len = zp->z_size - off; error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); @@ -1368,7 +1571,7 @@ /* * Nothing to do if file already at desired length. */ - if (end >= zp->z_phys->zp_size) { + if (end >= zp->z_size) { zfs_range_unlock(rl); return (0); } @@ -1380,7 +1583,8 @@ } top: tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { if (error == ERESTART) { @@ -1392,9 +1596,11 @@ zfs_range_unlock(rl); return (error); } - dmu_buf_will_dirty(zp->z_dbuf, tx); + + zp->z_size = end; - zp->z_phys->zp_size = end; + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx)); dmu_tx_commit(tx); @@ -1446,9 +1652,17 @@ dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; + uint64_t mode; + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[2]; + int count = 0; int error; - if (off > zp->z_phys->zp_size) { + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, + sizeof (mode))) != 0) + return (error); + + if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; @@ -1459,8 +1673,9 @@ /* * Check for any locks in the region to be freed. */ - if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) { - uint64_t length = (len ? len : zp->z_phys->zp_size - off); + + if (MANDLOCK(vp, (mode_t)mode)) { + uint64_t length = (len ? len : zp->z_size - off); if (error = chklock(vp, FWRITE, off, length, flag, NULL)) return (error); } @@ -1469,14 +1684,15 @@ error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && - off + len > zp->z_phys->zp_size) + off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) return (error); log: tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { if (error == ERESTART) { @@ -1488,7 +1704,12 @@ return (error); } - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); @@ -1499,7 +1720,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { zfsvfs_t zfsvfs; - uint64_t moid, obj, version; + uint64_t moid, obj, sa_obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; @@ -1526,12 +1747,7 @@ /* * Set starting attributes. 
*/ - if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE) - version = ZPL_VERSION; - else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) - version = ZPL_VERSION_USERSPACE - 1; - else - version = ZPL_VERSION_FUID - 1; + version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ @@ -1557,6 +1773,18 @@ error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); /* + * Create zap object used for SA attribute registration + */ + + if (version >= ZPL_VERSION_SA) { + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT(error == 0); + } else { + sa_obj = 0; + } + /* * Create a delete queue. */ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); @@ -1577,6 +1805,7 @@ rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; + rootzp->z_is_sa = USE_SA(version, os); vp = ZTOV(rootzp); vn_reinit(vp); @@ -1588,7 +1817,11 @@ zfsvfs.z_parent = &zfsvfs; zfsvfs.z_version = version; zfsvfs.z_use_fuids = USE_FUIDS(version, os); + zfsvfs.z_use_sa = USE_SA(version, os); zfsvfs.z_norm = norm; + + zfsvfs.z_attr_table = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END); + /* * Fold case on file systems that are always or sometimes case * insensitive. @@ -1607,7 +1840,7 @@ rootzp->z_zfsvfs = &zfsvfs; VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids)); - zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); @@ -1616,8 +1849,7 @@ POINTER_INVALIDATE(&rootzp->z_zfsvfs); ZTOV(rootzp)->v_count = 0; - dmu_buf_rele(rootzp->z_dbuf, NULL); - rootzp->z_dbuf = NULL; + sa_handle_destroy(rootzp->z_sa_hdl); kmem_cache_free(znode_cache, rootzp); /* @@ -1633,33 +1865,59 @@ } #endif /* _KERNEL */ + /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. 
*/ static int -zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) +zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir, + sa_attr_type_t *sa_table) { dmu_buf_t *db; dmu_object_info_t doi; - znode_phys_t *zp; int error; + uint64_t parent; + uint64_t pflags; + uint64_t mode; + sa_bulk_attr_t bulk[3]; + sa_handle_t *hdl; + int count = 0; - if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) + if ((error = sa_buf_hold(osp, obj, FTAG, &db)) != 0) return (error); dmu_object_info_from_db(db, &doi); - if (doi.doi_bonus_type != DMU_OT_ZNODE || + if ((doi.doi_bonus_type != DMU_OT_SA && + doi.doi_bonus_type != DMU_OT_ZNODE) || + doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)) { - dmu_buf_rele(db, FTAG); + sa_buf_rele(db, FTAG); return (EINVAL); } - zp = db->db_data; - *pobjp = zp->zp_parent; - *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && - S_ISDIR(zp->zp_mode); - dmu_buf_rele(db, FTAG); + if ((error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, + &hdl)) != 0) { + sa_buf_rele(db, FTAG); + return (error); + } + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], + NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, + &pflags, 8); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &mode, 8); + + if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) { + sa_buf_rele(db, FTAG); + sa_handle_destroy(hdl); + return (error); + } + *pobjp = parent; + *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); + sa_handle_destroy(hdl); + sa_buf_rele(db, FTAG); return (0); } @@ -1668,10 +1926,19 @@ zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) { char *path = buf + len - 1; + sa_attr_type_t *sa_table; int error; + uint64_t sa_obj = 0; *path = '\0'; + error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); + + if (error != 0 && error != ENOENT) + return (error); + + sa_table = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END); + for (;;) { uint64_t pobj; char component[MAXNAMELEN + 2]; @@ -1679,7 +1946,7 @@ int is_xattrdir; if ((error = zfs_obj_to_pobj(osp, obj, &pobj, - &is_xattrdir)) != 0) + &is_xattrdir, sa_table)) != 0) break; if (pobj == obj) { @@ -1707,5 +1974,6 @@ if (error == 0) (void) memmove(buf, path, buf + len - path); + return (error); } diff -r fdae577692c4 -r 538c866aaac6 usr/src/uts/common/sys/fs/zfs.h --- a/usr/src/uts/common/sys/fs/zfs.h Tue Mar 16 06:44:44 2010 -0700 +++ b/usr/src/uts/common/sys/fs/zfs.h Tue Mar 16 09:43:38 2010 -0600 @@ -324,14 +324,15 @@ #define SPA_VERSION_21 21ULL #define SPA_VERSION_22 22ULL #define SPA_VERSION_23 23ULL +#define SPA_VERSION_24 24ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, * and do the appropriate changes. Also bump the version number in * usr/src/grub/capability. */ -#define SPA_VERSION SPA_VERSION_23 -#define SPA_VERSION_STRING "23" +#define SPA_VERSION SPA_VERSION_24 +#define SPA_VERSION_STRING "24" /* * Symbolic names for the changes that caused a SPA_VERSION switch. 
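/*
 * Illustrative sketch (hypothetical helper, not part of this changeset):
 * on-disk features introduced by a pool version bump are gated at run
 * time by comparing the pool's version against these constants, so the
 * version-24 support added above is typically consumed by a check of
 * the following shape (the symbolic SPA_VERSION_SA alias for version 24
 * is added just below):
 */
static boolean_t
example_pool_supports_sa(spa_t *spa)
{
	/* System attributes require a pool upgraded to version 24. */
	return (spa_version(spa) >= SPA_VERSION_24);
}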
@@ -376,6 +377,7 @@ #define SPA_VERSION_DEDUP SPA_VERSION_21 #define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 +#define SPA_VERSION_SA SPA_VERSION_24 /* * ZPL version - rev'd whenever an incompatible on-disk format change @@ -389,8 +391,9 @@ #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL #define ZPL_VERSION_4 4ULL -#define ZPL_VERSION ZPL_VERSION_4 -#define ZPL_VERSION_STRING "4" +#define ZPL_VERSION_5 5ULL +#define ZPL_VERSION ZPL_VERSION_5 +#define ZPL_VERSION_STRING "5" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 @@ -398,6 +401,7 @@ #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 +#define ZPL_VERSION_SA ZPL_VERSION_5 /* Rewind request information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */
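/*
 * Illustrative sketch (not part of this changeset): zfs_create_fs() in
 * zfs_znode.c above now derives the ZPL version from the pool version
 * via zfs_zpl_version_map() (declared in zfs_comutil.h) instead of the
 * deleted if/else chain.  The real implementation is outside this diff;
 * a minimal mapping consistent with the deleted logic plus the new
 * SPA_VERSION_SA/ZPL_VERSION_SA pair might look like this (hypothetical
 * name and body):
 */
static uint64_t
example_zpl_version_map(uint64_t spa_vers)
{
	/* Each ZPL format revision requires a minimum pool version. */
	if (spa_vers >= SPA_VERSION_SA)
		return (ZPL_VERSION_SA);	/* 5: system attributes */
	if (spa_vers >= SPA_VERSION_USERSPACE)
		return (ZPL_VERSION_USERSPACE);	/* 4 */
	if (spa_vers >= SPA_VERSION_FUID)
		return (ZPL_VERSION_FUID);	/* 3 */
	return (ZPL_VERSION_FUID - 1);		/* 2 */
}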