3977 zones can commit suicide by zvol
Reviewed by: Gordon Ross <
[email protected]>
Reviewed by: Richard Lowe <
[email protected]>
Approved by: Garrett D'Amore <
[email protected]>
--- a/usr/src/cmd/zoneadm/zfs.c Thu Nov 10 09:09:20 2011 +0000
+++ b/usr/src/cmd/zoneadm/zfs.c Mon Jun 11 18:23:37 2012 +0000
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/*
@@ -71,34 +72,6 @@
} clone_data_t;
/*
- * A ZFS file system iterator call-back function which is used to validate
- * datasets imported into the zone.
- */
-/* ARGSUSED */
-static int
-check_zvol(zfs_handle_t *zhp, void *unused)
-{
- int ret;
-
- if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) {
- /*
- * TRANSLATION_NOTE
- * zfs and dataset are literals that should not be translated.
- */
- (void) fprintf(stderr, gettext("cannot verify zfs dataset %s: "
- "volumes cannot be specified as a zone dataset resource\n"),
- zfs_get_name(zhp));
- ret = -1;
- } else {
- ret = zfs_iter_children(zhp, check_zvol, NULL);
- }
-
- zfs_close(zhp);
-
- return (ret);
-}
-
-/*
* A ZFS file system iterator call-back function which returns the
* zfs_handle_t for a ZFS file system on the specified mount point.
*/
@@ -1259,17 +1232,6 @@
continue;
}
- if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) {
- (void) fprintf(stderr, gettext("cannot verify zfs "
- "dataset %s: volumes cannot be specified as a "
- "zone dataset resource\n"),
- dstab.zone_dataset_name);
- return_code = Z_ERR;
- }
-
- if (zfs_iter_children(zhp, check_zvol, NULL) != 0)
- return_code = Z_ERR;
-
zfs_close(zhp);
}
(void) zonecfg_enddsent(handle);
--- a/usr/src/uts/common/fs/dev/sdev_subr.c Thu Nov 10 09:09:20 2011 +0000
+++ b/usr/src/uts/common/fs/dev/sdev_subr.c Mon Jun 11 18:23:37 2012 +0000
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/*
@@ -537,7 +538,7 @@
SDEV_DYNAMIC | SDEV_VTOR },
{ "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops,
- devzvol_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },
+ devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },
{ "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE },
--- a/usr/src/uts/common/fs/dev/sdev_zvolops.c Thu Nov 10 09:09:20 2011 +0000
+++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c Mon Jun 11 18:23:37 2012 +0000
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2013 Joyent, Inc. All rights reserved.
*/
/* vnode ops for the /dev/zvol directory */
@@ -47,6 +48,7 @@
static ldi_handle_t devzvol_lh;
static kmutex_t devzvol_mtx;
static boolean_t devzvol_isopen;
+static major_t devzvol_major;
/*
* we need to use ddi_mod* since fs/dev gets loaded early on in
@@ -61,12 +63,16 @@
int
sdev_zvol_create_minor(char *dsname)
{
+ if (szcm == NULL)
+ return (-1);
return ((*szcm)(dsname));
}
int
sdev_zvol_name2minor(char *dsname, minor_t *minor)
{
+ if (szn2m == NULL)
+ return (-1);
return ((*szn2m)(dsname, minor));
}
@@ -74,6 +80,7 @@
devzvol_open_zfs()
{
int rc;
+ dev_t dv;
devzvol_li = ldi_ident_from_anon();
if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
@@ -94,6 +101,9 @@
cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
return (rc);
}
+ if (ldi_get_dev(devzvol_lh, &dv))
+ return (-1);
+ devzvol_major = getmajor(dv);
return (0);
}
@@ -270,6 +280,8 @@
sdcmn_err13((" v_type %d do_type %d",
SDEVTOV(dv)->v_type, do_type));
if ((SDEVTOV(dv)->v_type == VLNK && do_type != DMU_OST_ZVOL) ||
+ ((SDEVTOV(dv)->v_type == VBLK || SDEVTOV(dv)->v_type == VCHR) &&
+ do_type != DMU_OST_ZVOL) ||
(SDEVTOV(dv)->v_type == VDIR && do_type == DMU_OST_ZVOL)) {
kmem_free(dsname, strlen(dsname) + 1);
return (SDEV_VTOR_STALE);
@@ -486,6 +498,82 @@
rw_downgrade(&ddv->sdev_contents);
}
+/*
+ * This function is used to create a dir or dev inside a zone's /dev when the
+ * zone has a zvol that is dynamically created within the zone (i.e. inside
+ * of a delegated dataset. Since there is no /devices tree within a zone,
+ * we create the chr/blk devices directly inside the zone's /dev instead of
+ * making symlinks.
+ */
+static int
+devzvol_mk_ngz_node(struct sdev_node *parent, char *nm)
+{
+ struct vattr vattr;
+ timestruc_t now;
+ enum vtype expected_type = VDIR;
+ dmu_objset_type_t do_type;
+ struct sdev_node *dv = NULL;
+ int res;
+ char *dsname;
+
+ bzero(&vattr, sizeof (vattr));
+ gethrestime(&now);
+ vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
+ vattr.va_uid = SDEV_UID_DEFAULT;
+ vattr.va_gid = SDEV_GID_DEFAULT;
+ vattr.va_type = VNON;
+ vattr.va_atime = now;
+ vattr.va_mtime = now;
+ vattr.va_ctime = now;
+
+ if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL)
+ return (ENOENT);
+
+ if (devzvol_objset_check(dsname, &do_type) != 0) {
+ kmem_free(dsname, strlen(dsname) + 1);
+ return (ENOENT);
+ }
+ if (do_type == DMU_OST_ZVOL)
+ expected_type = VBLK;
+
+ if (expected_type == VDIR) {
+ vattr.va_type = VDIR;
+ vattr.va_mode = SDEV_DIRMODE_DEFAULT;
+ } else {
+ minor_t minor;
+ dev_t devnum;
+ int rc;
+
+ rc = sdev_zvol_create_minor(dsname);
+ if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
+ sdev_zvol_name2minor(dsname, &minor)) {
+ kmem_free(dsname, strlen(dsname) + 1);
+ return (ENOENT);
+ }
+
+ devnum = makedevice(devzvol_major, minor);
+ vattr.va_rdev = devnum;
+
+ if (strstr(parent->sdev_path, "/rdsk/") != NULL)
+ vattr.va_type = VCHR;
+ else
+ vattr.va_type = VBLK;
+ vattr.va_mode = SDEV_DEVMODE_DEFAULT;
+ }
+ kmem_free(dsname, strlen(dsname) + 1);
+
+ rw_enter(&parent->sdev_contents, RW_WRITER);
+
+ res = sdev_mknode(parent, nm, &dv, &vattr,
+ NULL, NULL, kcred, SDEV_READY);
+ rw_exit(&parent->sdev_contents);
+ if (res != 0)
+ return (ENOENT);
+
+ SDEV_RELE(dv);
+ return (0);
+}
+
/*ARGSUSED*/
static int
devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
@@ -505,9 +593,52 @@
return (error);
rw_enter(&parent->sdev_contents, RW_READER);
- if (!SDEV_IS_GLOBAL(parent)) {
+ if (SDEV_IS_GLOBAL(parent)) {
+ /*
+ * During iter_datasets, don't create GZ dev when running in
+ * NGZ. We can't return ENOENT here since that could
+ * incorrectly trigger the creation of the dev from the
+ * recursive call through prof_filldir during iter_datasets.
+ */
+ if (getzoneid() != GLOBAL_ZONEID) {
+ rw_exit(&parent->sdev_contents);
+ return (EPERM);
+ }
+ } else {
+ int res;
+
rw_exit(&parent->sdev_contents);
- return (prof_lookup(dvp, nm, vpp, cred));
+
+ /*
+ * If we're in the global zone and reach down into a non-global
+ * zone's /dev/zvol then this action could trigger the creation
+ * of all of the zvol devices for every zone into the non-global
+ * zone's /dev tree. This could be a big security hole. To
+ * prevent this, disallow the global zone from looking inside
+ * a non-global zones /dev/zvol. This behavior is similar to
+ * delegated datasets, which cannot be used by the global zone.
+ */
+ if (getzoneid() == GLOBAL_ZONEID)
+ return (EPERM);
+
+ res = prof_lookup(dvp, nm, vpp, cred);
+
+ /*
+ * We won't find a zvol that was dynamically created inside
+ * a NGZ, within a delegated dataset, in the zone's dev profile
+ * but prof_lookup will also find it via sdev_cache_lookup.
+ */
+ if (res == ENOENT) {
+ /*
+ * We have to create the sdev node for the dymamically
+ * created zvol.
+ */
+ if (devzvol_mk_ngz_node(parent, nm) != 0)
+ return (ENOENT);
+ res = prof_lookup(dvp, nm, vpp, cred);
+ }
+
+ return (res);
}
dsname = devzvol_make_dsname(parent->sdev_path, nm);
@@ -613,8 +744,10 @@
} else if (rc == ENOENT) {
goto skip;
} else {
- /* EBUSY == problem with zvols's dmu holds? */
- ASSERT(0);
+ /*
+ * EBUSY == problem with zvols's dmu holds?
+ * EPERM when in a NGZ and traversing up and out.
+ */
goto skip;
}
if (arg == ZFS_IOC_DATASET_LIST_NEXT &&