/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference counted zone_t) are tracked in
 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association.  Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone.  The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone.  The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready.  The zone is made visible after the ZSD constructor callbacks are
 *   executed.  A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init.  A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 *   killing all processes running in the zone.  The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.  Setting of the zone's state to
 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
 *   the zone's status without worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone.  The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 *   processes or threads doing work on behalf of the zone.  The zone is
 *   removed from the list of active zones.  zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 *   callbacks are executed, and all memory associated with the zone is
 *   freed.
 *
 *   Threads can wait for the zone to enter a requested state by using
 *   zone_status_wait() or zone_status_timedwait() with the desired
 *   state passed in as an argument.  Zone state transitions are
 *   uni-directional; it is not possible to move back to an earlier state.
 *
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released.  zone_hold and zone_rele can be
 *   used to adjust the reference count.  In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name.  Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name.  Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed.  Zones can also be searched by path; zone_find_by_path
 *   returns the zone with which a path name is associated (global
 *   zone if the path is not within some other zone's file system
 *   hierarchy).  This currently requires iterating through each zone,
 *   so it is slower than an id or name search via a hash table.
 *
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.  It also
 *       protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
 *
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up zone id based on name
 *   - zone_shutdown: initiates shutdown process (see states above)
 *   - zone_destroy: completes shutdown process (see states above)
 *
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/log.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/fs/snode.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/zone.h>

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;	/* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;	/* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_nlwps;
/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;

const char * const zone_initname = "/sbin/init";

static int zone_shutdown(zoneid_t zoneid);

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created such that
 * nfs_mount() thinks it is in the global zone, while by the time it
 * gets added to the list of mounted zones, it ends up on zoneA's mount
 * list.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone creation and new mounts.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone creations, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is unfair bias towards the
 * "current" operation.  This means that zone creations may starve if
 * there is a rapid succession of new mounts coming in to the system, or
 * there is a remote possibility that zones will be created at such a
 * rate that new mounts will not be able to proceed.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(void)
{
	int retval = 0;

	/*
	 * Since it may block for a long time, block_mounts() shouldn't be
	 * called with zonehash_lock held.
	 */
	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
	mutex_enter(&mount_lock);
	while (mounts_in_progress > 0) {
		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
			goto signaled;
	}
	/*
	 * A negative value of mounts_in_progress indicates that mounts
	 * have been blocked by (-mounts_in_progress) different callers.
	 */
	mounts_in_progress--;
	retval = 1;
signaled:
	mutex_exit(&mount_lock);
	return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(void)
{
	mutex_enter(&mount_lock);
	if (++mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * The VFS layer is busy with a mount; zones should wait until all
 * mounts are completed to progress.
 */
void
mount_in_progress(void)
{
	mutex_enter(&mount_lock);
	while (mounts_in_progress < 0)
		cv_wait(&mount_cv, &mount_lock);
	mounts_in_progress++;
	mutex_exit(&mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(void)
{
	mutex_enter(&mount_lock);
	if (--mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}
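
/*
 * Illustrative sketch (not part of the original source): a VFS-layer
 * caller is expected to bracket the actual mount operation with the
 * pair above, along the lines of:
 *
 *	mount_in_progress();
 *	error = VFS_MOUNT(vfsp, mvp, uap, credp);
 *	mount_completed();
 *
 * while zone creation brackets its critical region with block_mounts()
 * and resume_mounts().  The variable names here are assumptions for the
 * example; only the four synchronization functions come from this file.
 */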

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shutdown, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  The
 * constructor callback is called immediately on all existing zones, and a
 * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
 * zone_lock).  As this operation requires the list of zones, the list of
 * registered keys, and the per-zone list of ZSD entries to remain constant
 * throughout the entire operation, it must grab zonehash_lock, zone_lock for
 * all existing zones, and zsd_key_lock, in that order.  Similar locking is
 * needed when zone_key_delete() is called.  It is thus sufficient to hold
 * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
 * per-zone zone_zsd list.
 *
 * Note that this implementation does not make a copy of the ZSD entry if a
 * constructor callback is not provided.  A zone_getspecific() on such an
 * uninitialized ZSD entry will return NULL.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */
void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	struct zone *zone;

	zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
	zsdp->zsd_data = NULL;
	zsdp->zsd_create = create;
	zsdp->zsd_shutdown = shutdown;
	zsdp->zsd_destroy = destroy;

	mutex_enter(&zonehash_lock);	/* stop the world */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	*keyp = zsdp->zsd_key = ++zsd_keyval;
	ASSERT(zsd_keyval != 0);
	list_insert_tail(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	if (create != NULL) {
		for (zone = list_head(&zone_active); zone != NULL;
		    zone = list_next(&zone_active, zone)) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = *keyp;
			t->zsd_data = (*create)(zone->zone_id);
			t->zsd_create = create;
			t->zsd_shutdown = shutdown;
			t->zsd_destroy = destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
}
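
/*
 * Illustrative sketch (not part of the original source): how a
 * hypothetical subsystem "foo" might register a ZSD key.  The names
 * foo_zone_key, foo_zone_init, foo_zone_fini, and struct foo_zone_data
 * are invented for this example.
 *
 *	static zone_key_t foo_zone_key;
 *
 *	static void *
 *	foo_zone_init(zoneid_t zoneid)
 *	{
 *		return (kmem_zalloc(sizeof (struct foo_zone_data),
 *		    KM_SLEEP));
 *	}
 *
 *	static void
 *	foo_zone_fini(zoneid_t zoneid, void *arg)
 *	{
 *		if (arg != NULL)
 *			kmem_free(arg, sizeof (struct foo_zone_data));
 *	}
 *
 *	zone_key_create(&foo_zone_key, foo_zone_init, NULL, foo_zone_fini);
 *
 * The constructor runs immediately for all existing zones and for every
 * zone created afterwards; as noted above, destructors may be handed a
 * NULL value and must tolerate it.
 */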

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			/*
			 * Move to head of list to keep list in MRU order.
			 */
			if (zsd != list_head(l)) {
				list_remove(l, zsd);
				list_insert_head(l, zsd);
			}
			return (zsd);
		}
	}
	return (NULL);
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 */
int
zone_key_delete(zone_key_t key)
{
	struct zsd_entry *zsdp = NULL;
	zone_t *zone;

	mutex_enter(&zonehash_lock);	/* Zone create/delete waits for us */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL)
		goto notfound;
	list_remove(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		struct zsd_entry *del;
		void *data;

		if (!(zone->zone_flags & ZF_DESTROYED)) {
			del = zsd_find(&zone->zone_zsd, key);
			if (del != NULL) {
				data = del->zsd_data;
				ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
				ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
				list_remove(&zone->zone_zsd, del);
				kmem_free(del, sizeof (*del));
			} else {
				data = NULL;
			}
			if (zsdp->zsd_shutdown)
				zsdp->zsd_shutdown(zone->zone_id, data);
			if (zsdp->zsd_destroy)
				zsdp->zsd_destroy(zone->zone_id, data);
		}
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);
	kmem_free(zsdp, sizeof (*zsdp));
	return (0);

notfound:
	mutex_exit(&zsd_key_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
	return (-1);
}

/*
 * ZSD counterpart of pthread_setspecific().
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
	struct zsd_entry *t;
	struct zsd_entry *zsdp = NULL;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	if (t != NULL) {
		/*
		 * Replace old value with new
		 */
		t->zsd_data = (void *)data;
		mutex_exit(&zone->zone_lock);
		return (0);
	}
	/*
	 * If there was no previous value, go through the list of registered
	 * keys.
	 *
	 * We avoid grabbing zsd_key_lock until we are sure we need it; this is
	 * necessary for shutdown callbacks to be able to execute without fear
	 * of deadlock.
	 */
	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL) {	/* Key was not registered */
		mutex_exit(&zsd_key_lock);
		mutex_exit(&zone->zone_lock);
		return (-1);
	}

	/*
	 * Add a zsd_entry to this zone, using the template we just retrieved
	 * to initialize the constructor and destructor(s).
	 */
	t = kmem_alloc(sizeof (*t), KM_SLEEP);
	t->zsd_key = key;
	t->zsd_data = (void *)data;
	t->zsd_create = zsdp->zsd_create;
	t->zsd_shutdown = zsdp->zsd_shutdown;
	t->zsd_destroy = zsdp->zsd_destroy;
	list_insert_tail(&zone->zone_zsd, t);
	mutex_exit(&zsd_key_lock);
	mutex_exit(&zone->zone_lock);
	return (0);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
	struct zsd_entry *t;
	void *data;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	data = (t == NULL ? NULL : t->zsd_data);
	mutex_exit(&zone->zone_lock);
	return (data);
}
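
/*
 * Illustrative sketch (not part of the original source): retrieving the
 * per-zone data registered with a key, typically from the current
 * process's zone.  foo_zone_key and struct foo_zone_data are the
 * invented names from the example above.
 *
 *	struct foo_zone_data *fzd;
 *
 *	fzd = zone_getspecific(foo_zone_key, curproc->p_zone);
 *
 * Since no per-zone entry is made when the constructor is NULL, a lookup
 * may return NULL until zone_setspecific() has stored a value; callers
 * should check for that.
 */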

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys), and the constructor
 * callback executed (if one exists).
 *
 * This is called before the zone is made publicly available, hence no
 * need to grab zone_lock.
 *
 * Although we grab and release zsd_key_lock, new entries cannot be
 * added to or removed from the zsd_registered_keys list until we
 * release zonehash_lock, so there isn't a window for a
 * zone_key_create() to come in after we've dropped zsd_key_lock but
 * before the zone is added to the zone list, such that the constructor
 * callbacks aren't executed for the new zone.
 */
static void
zone_zsd_configure(zone_t *zone)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	ASSERT(list_head(&zone->zone_zsd) == NULL);
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		if (zsdp->zsd_create != NULL) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = zsdp->zsd_key;
			t->zsd_create = zsdp->zsd_create;
			t->zsd_data = (*t->zsd_create)(zoneid);
			t->zsd_shutdown = zsdp->zsd_shutdown;
			t->zsd_destroy = zsdp->zsd_destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	mutex_exit(&zsd_key_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

	mutex_enter(&zone->zone_lock);
	if (ct == ZSD_DESTROY) {
		if (zone->zone_flags & ZF_DESTROYED) {
			/*
			 * Make sure destructors are only called once.
			 */
			mutex_exit(&zone->zone_lock);
			return;
		}
		zone->zone_flags |= ZF_DESTROYED;
	}
	mutex_exit(&zone->zone_lock);

	/*
	 * Both zsd_key_lock and zone_lock need to be held in order to add or
	 * remove a ZSD key, (either globally as part of
	 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
	 * possible through zone_setspecific()), so it's sufficient to hold
	 * zsd_key_lock here.
	 *
	 * This is a good thing, since we don't want to recursively try to grab
	 * zone_lock if a callback attempts to do something like a crfree() or
	 * zone_rele().
	 */
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		zone_key_t key = zsdp->zsd_key;

		/* Skip if no callbacks registered */
		if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL)
			continue;
		if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL)
			continue;
		/*
		 * Call the callback with the zone-specific data if we can find
		 * any, otherwise with NULL.
		 */
		t = zsd_find(&zone->zone_zsd, key);
		if (t != NULL) {
			if (ct == ZSD_SHUTDOWN) {
				t->zsd_shutdown(zoneid, t->zsd_data);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				t->zsd_destroy(zoneid, t->zsd_data);
			}
		} else {
			if (ct == ZSD_SHUTDOWN) {
				zsdp->zsd_shutdown(zoneid, NULL);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				zsdp->zsd_destroy(zoneid, NULL);
			}
		}
	}
	mutex_exit(&zsd_key_lock);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
	struct zsd_entry *t, *next;

	/*
	 * Free all the zsd_entry's we had on this zone.
	 */
	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
		next = list_next(&zone->zone_zsd, t);
		list_remove(&zone->zone_zsd, t);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_zsd);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);

	e->rcep_p.zone->zone_shares = nv;
	return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,
	zone_cpu_shares_usage,
	zone_cpu_shares_set,
	rcop_no_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
	rctl_qty_t nlwps;
	zone_t *zone = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));

	mutex_enter(&zone->zone_nlwps_lock);
	nlwps = zone->zone_nlwps;
	mutex_exit(&zone->zone_nlwps_lock);

	return (nlwps);
}

/*ARGSUSED*/
static int
zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nlwps;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
	nlwps = e->rcep_p.zone->zone_nlwps;

	if (nlwps + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_nlwps_ctl = nv;
	return (0);
}

static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,
	zone_lwps_usage,
	zone_lwps_set,
	zone_lwps_test,
};

/*
 * Helper function to brand the zone with a unique ID.
 */
static void
zone_uniqid(zone_t *zone)
{
	static uint64_t uniqid = 0;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	zone->zone_uniqid = uniqid++;
}

/*
 * Returns a held pointer to the "kcred" for the specified zone.
 */
struct cred *
zone_get_kcred(zoneid_t zoneid)
{
	zone_t *zone;
	cred_t *cr;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (NULL);
	cr = zone->zone_kcred;
	crhold(cr);
	zone_rele(zone);
	return (cr);
}

/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init().  It also initializes
 * portions of zone0 which may be used before zone_init() is called.  The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 */
void
zone_zsd_init(void)
{
	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_create(&zone_active, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));
	list_create(&zone_deathrow, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));

	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
	zone0.zone_shares = 1;
	zone0.zone_nlwps_ctl = INT_MAX;
	zone0.zone_name = GLOBAL_ZONENAME;
	zone0.zone_nodename = utsname.nodename;
	zone0.zone_domain = srpc_domain;
	zone0.zone_ref = 1;
	zone0.zone_id = GLOBAL_ZONEID;
	zone0.zone_status = ZONE_IS_RUNNING;
	zone0.zone_rootpath = "/";
	zone0.zone_rootpathlen = 2;
	zone0.zone_psetid = ZONE_PS_INVAL;
	zone0.zone_ncpus = 0;
	zone0.zone_ncpus_online = 0;
	zone0.zone_proc_initpid = 1;
	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_insert_head(&zone_active, &zone0);

	/*
	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
	 * to anything meaningful.  It is assigned to be 'rootdir' in
	 * vfs_mountroot().
	 */
	zone0.zone_rootvp = NULL;
	zone0.zone_vfslist = NULL;
	zone0.zone_bootargs = NULL;
	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
	/*
	 * The global zone has all privileges
	 */
	priv_fillset(zone0.zone_privset);
	/*
	 * Add p0 to the global zone
	 */
	zone0.zone_zsched = &p0;
	p0.p_zone = &zone0;
}

/*
 * Called by main() to initialize the zones framework.
 */
void
zone_init(void)
{
	rctl_dict_entry_t *rde;
	rctl_val_t *dval;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;

	ASSERT(curproc == &p0);

	/*
	 * Create ID space for zone IDs.  ID 0 is reserved for the
	 * global zone.
	 */
	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

	/*
	 * Initialize generic zone resource controls, if any.
	 */
	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);

	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_lwps_ops);
	/*
	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
	 */
	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
	bzero(dval, sizeof (rctl_val_t));
	dval->rcv_value = 1;
	dval->rcv_privilege = RCPRIV_PRIVILEGED;
	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
	dval->rcv_action_recip_pid = -1;

	rde = rctl_dict_lookup("zone.cpu-shares");
	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);

	/*
	 * Initialize the ``global zone''.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&p0.p_lock);
	e.rcep_p.zone = &zone0;
	e.rcep_t = RCENTITY_ZONE;
	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set, gp);

	zone0.zone_nlwps = p0.p_lwpcnt;
	zone0.zone_ntasks = 1;
	mutex_exit(&p0.p_lock);
	rctl_prealloc_destroy(gp);
	/*
	 * pool_default hasn't been initialized yet, so we let pool_init()
	 * take care of making sure the global zone is in the default pool.
	 */
	mutex_enter(&zonehash_lock);
	zone_uniqid(&zone0);
	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
	mutex_exit(&zonehash_lock);
	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
	    mod_hash_null_valdtor);
	zonehashbyname = mod_hash_create_strhash("zone_by_name",
	    zone_hash_size, mod_hash_null_valdtor);
	zonecount = 1;

	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
	    (mod_hash_val_t)&zone0);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
	    (mod_hash_val_t)&zone0);
	/*
	 * We avoid setting zone_kcred until now, since kcred is initialized
	 * sometime after zone_zsd_init() and before zone_init().
	 */
	zone0.zone_kcred = kcred;
	/*
	 * The global zone is fully initialized (except for zone_rootvp which
	 * will be set when the root filesystem is mounted).
	 */
	global_zone = &zone0;
}

static void
zone_free(zone_t *zone)
{
	ASSERT(zone != global_zone);
	ASSERT(zone->zone_ntasks == 0);
	ASSERT(zone->zone_nlwps == 0);
	ASSERT(zone->zone_cred_ref == 0);
	ASSERT(zone->zone_kcred == NULL);
	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);

	/* remove from deathrow list */
	if (zone_status_get(zone) == ZONE_IS_DEAD) {
		ASSERT(zone->zone_ref == 0);
		mutex_enter(&zone_deathrow_lock);
		list_remove(&zone_deathrow, zone);
		mutex_exit(&zone_deathrow_lock);
	}

	zone_free_zsd(zone);

	if (zone->zone_rootvp != NULL)
		VN_RELE(zone->zone_rootvp);
	if (zone->zone_rootpath)
		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
	if (zone->zone_name != NULL)
		kmem_free(zone->zone_name, ZONENAME_MAX);
	if (zone->zone_nodename != NULL)
		kmem_free(zone->zone_nodename, _SYS_NMLN);
	if (zone->zone_domain != NULL)
		kmem_free(zone->zone_domain, _SYS_NMLN);
	if (zone->zone_privset != NULL)
		kmem_free(zone->zone_privset, sizeof (priv_set_t));
	if (zone->zone_rctls != NULL)
		rctl_set_free(zone->zone_rctls);
	if (zone->zone_bootargs != NULL)
		kmem_free(zone->zone_bootargs, ZONEBOOTARGS_MAX);
	id_free(zoneid_space, zone->zone_id);
	mutex_destroy(&zone->zone_lock);
	cv_destroy(&zone->zone_cv);
	kmem_free(zone, sizeof (zone_t));
}

/*
 * See block comment at the top of this file for information about zone
 * status values.
 */
/*
 * Convenience function for setting zone status.
 */
static void
zone_status_set(zone_t *zone, zone_status_t status)
{
	ASSERT(MUTEX_HELD(&zone_status_lock));
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
	    status >= zone_status_get(zone));
	zone->zone_status = status;
	cv_broadcast(&zone->zone_cv);
}

/*
 * Public function to retrieve the zone status.  The zone status may
 * change after it is retrieved.
 */
zone_status_t
zone_status_get(zone_t *zone)
{
	return (zone->zone_status);
}

static int
zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
{
	char *bootargs = kmem_zalloc(ZONEBOOTARGS_MAX, KM_SLEEP);
	size_t len;
	int err;

	err = copyinstr(zone_bootargs, bootargs, ZONEBOOTARGS_MAX - 1, &len);
	if (err != 0) {
		kmem_free(bootargs, ZONEBOOTARGS_MAX);
		return (err);	/* EFAULT or ENAMETOOLONG */
	}
	bootargs[len] = '\0';

	ASSERT(zone->zone_bootargs == NULL);
	zone->zone_bootargs = bootargs;
	return (0);
}

/*
 * Block indefinitely waiting for (zone_status >= status)
 */
void
zone_status_wait(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		cv_wait(&zone->zone_cv, &zone_status_lock);
	}
	mutex_exit(&zone_status_lock);
}

/*
 * Private CPR-safe version of zone_status_wait().
 */
static void
zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
{
	callb_cpr_t cprinfo;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr, str);
	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&zone->zone_cv, &zone_status_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
	}
	/*
	 * zone_status_lock is implicitly released by the following.
	 */
	CALLB_CPR_EXIT(&cprinfo);
}

/*
 * Block until zone enters requested state or signal is received.  Return (0)
 * if signaled, non-zero otherwise.
 */
int
zone_status_wait_sig(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
			mutex_exit(&zone_status_lock);
			return (0);
		}
	}
	mutex_exit(&zone_status_lock);
	return (1);
}

/*
 * Block until the zone enters the requested state or the timeout expires,
 * whichever happens first.  Return (-1) if operation timed out, time remaining
 * otherwise.
 */
clock_t
zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = 0;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status && timeleft != -1) {
		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}

/*
 * Block until the zone enters the requested state, the current process is
 * signaled, or the timeout expires, whichever happens first.  Return (-1) if
 * operation timed out, 0 if signaled, time remaining otherwise.
 */
clock_t
zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = tim - lbolt;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
		    tim);
		if (timeleft <= 0)
			break;
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}
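
/*
 * Illustrative sketch (not part of the original source): a caller that
 * cannot proceed until a zone has finished shutting down might block
 * with a bounded wait.  As with cv_timedwait(), 'tim' is an absolute
 * tick value; the ten-second bound here is an arbitrary example.
 *
 *	if (zone_status_timedwait(zone, lbolt + 10 * hz,
 *	    ZONE_IS_DOWN) == -1) {
 *		(timed out; the zone has not yet reached ZONE_IS_DOWN)
 *	}
 *
 * Because state transitions are uni-directional, the wait also completes
 * if the zone has already moved past ZONE_IS_DOWN (e.g. to ZONE_IS_DEAD).
 */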

/*
 * Zones have two reference counts: one for references from credential
 * structures (zone_cred_ref), and one (zone_ref) for everything else.
 * This is so we can allow a zone to be rebooted while there are still
 * outstanding cred references, since certain drivers cache dblks (which
 * implicitly results in cached creds).  We wait for zone_ref to drop to
 * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
 * later freed when the zone_cred_ref drops to 0, though nothing other
 * than the zone id and privilege set should be accessed once the zone
 * is "dead".
 *
 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
 * to force halt/reboot to block waiting for the zone_cred_ref to drop
 * to 0.  This can be useful to flush out other sources of cached creds
 * that may be less innocuous than the driver case.
 */

int zone_wait_for_cred = 0;

static void
zone_hold_locked(zone_t *z)
{
	ASSERT(MUTEX_HELD(&z->zone_lock));
	z->zone_ref++;
	ASSERT(z->zone_ref != 0);
}

void
zone_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	zone_hold_locked(z);
	mutex_exit(&z->zone_lock);
}

/*
 * If the non-cred ref count drops to 1 and either the cred ref count
 * is 0 or we aren't waiting for cred references, the zone is ready to
 * be destroyed.
 */
#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
	    (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))

void
zone_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_ref != 0);
	z->zone_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/* signal zone_destroy so the zone can finish halting */
	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}

void
zone_cred_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_cred_ref++;
	ASSERT(z->zone_cred_ref != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_cred_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_cred_ref != 0);
	z->zone_cred_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/*
	 * If zone_destroy is waiting for the cred references to drain
	 * out, and they have, signal it.
	 */
	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
	    zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}

void
zone_task_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_ntasks++;
	ASSERT(z->zone_ntasks != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_task_rele(zone_t *zone)
{
	uint_t refcnt;

	mutex_enter(&zone->zone_lock);
	ASSERT(zone->zone_ntasks != 0);
	refcnt = --zone->zone_ntasks;
	if (refcnt > 1) {	/* Common case */
		mutex_exit(&zone->zone_lock);
		return;
	}
	zone_hold_locked(zone);	/* so we can use the zone_t later */
	mutex_exit(&zone->zone_lock);
	if (refcnt == 1) {
		/*
		 * See if the zone is shutting down.
		 */
		mutex_enter(&zone_status_lock);
		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
			goto out;
		}

		/*
		 * Make sure the ntasks didn't change since we
		 * dropped zone_lock.
		 */
		mutex_enter(&zone->zone_lock);
		if (refcnt != zone->zone_ntasks) {
			mutex_exit(&zone->zone_lock);
			goto out;
		}
		mutex_exit(&zone->zone_lock);

		/*
		 * No more user processes in the zone.  The zone is empty.
		 */
		zone_status_set(zone, ZONE_IS_EMPTY);
		goto out;
	}

	ASSERT(refcnt == 0);
	/*
	 * zsched has exited; the zone is dead.
	 */
	zone->zone_zsched = NULL;	/* paranoia */
	mutex_enter(&zone_status_lock);
	zone_status_set(zone, ZONE_IS_DEAD);
out:
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
}

zoneid_t
getzoneid(void)
{
	return (curproc->p_zone->zone_id);
}

/*
 * Internal versions of zone_find_by_*().  These don't zone_hold() or
 * check the validity of a zone's state.
 */
static zone_t *
zone_find_all_by_id(zoneid_t zoneid)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyid,
	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

static zone_t *
zone_find_all_by_name(char *name)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

/*
 * Public interface for looking up a zone by zoneid.  Only returns the zone if
 * it is fully initialized, and has not yet begun the zone_destroy() sequence.
 * Caller must call zone_rele() once it is done with the zone.
 *
 * The zone may begin the zone_destroy() sequence immediately after this
 * function returns, but may be safely used until zone_rele() is called.
 */
zone_t *
zone_find_by_id(zoneid_t zoneid)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}
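
/*
 * Illustrative sketch (not part of the original source): the canonical
 * lookup/use/release pattern for the zone_find_by_*() interfaces; the
 * ESRCH errno choice is just an example.
 *
 *	zone_t *zone;
 *
 *	if ((zone = zone_find_by_id(zoneid)) == NULL)
 *		return (ESRCH);
 *	(use zone; the zone_t cannot be freed while the hold is in place)
 *	zone_rele(zone);
 *
 * The zone may begin shutting down while held, so a caller can rely on
 * the structure remaining valid, but not on the zone staying running.
 */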
|
|
1473 |
|
|
1474 |
/*
|
|
1475 |
* Similar to zone_find_by_id, but using zone name as the key.
|
|
1476 |
*/
|
|
1477 |
zone_t *
|
|
1478 |
zone_find_by_name(char *name)
|
|
1479 |
{
|
|
1480 |
zone_t *zone;
|
|
1481 |
zone_status_t status;
|
|
1482 |
|
|
1483 |
mutex_enter(&zonehash_lock);
|
|
1484 |
if ((zone = zone_find_all_by_name(name)) == NULL) {
|
|
1485 |
mutex_exit(&zonehash_lock);
|
|
1486 |
return (NULL);
|
|
1487 |
}
|
|
1488 |
status = zone_status_get(zone);
|
|
1489 |
if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
|
|
1490 |
/*
|
|
1491 |
* For all practical purposes the zone doesn't exist.
|
|
1492 |
*/
|
|
1493 |
mutex_exit(&zonehash_lock);
|
|
1494 |
return (NULL);
|
|
1495 |
}
|
|
1496 |
zone_hold(zone);
|
|
1497 |
mutex_exit(&zonehash_lock);
|
|
1498 |
return (zone);
|
|
1499 |
}
|
|
1500 |
|
|
1501 |
/*
 * Similar to zone_find_by_id(), using the path as a key. For instance,
 * if there is a zone "foo" rooted at /foo/root, and the path argument
 * is "/foo/root/proc", it will return the held zone_t corresponding to
 * zone "foo".
 *
 * zone_find_by_path() always returns a non-NULL value, since at the
 * very least every path will be contained in the global zone.
 *
 * As with the other zone_find_by_*() functions, the caller is
 * responsible for zone_rele()ing the return value of this function.
 */
zone_t *
zone_find_by_path(const char *path)
{
	zone_t *zone;
	zone_t *zret = NULL;
	zone_status_t status;

	if (path == NULL) {
		/*
		 * Call from rootconf().
		 */
		zone_hold(global_zone);
		return (global_zone);
	}
	ASSERT(*path == '/');
	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		if (ZONE_PATH_VISIBLE(path, zone))
			zret = zone;
	}
	ASSERT(zret != NULL);
	status = zone_status_get(zret);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * Zone practically doesn't exist.
		 */
		zret = global_zone;
	}
	zone_hold(zret);
	mutex_exit(&zonehash_lock);
	return (zret);
}

/*
 * Get the number of cpus visible to this zone. The system-wide global
 * 'ncpus' is returned if pools are disabled, the caller is in the
 * global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_get(zone_t *zone)
{
	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;

	return (myncpus != 0 ? myncpus : ncpus);
}

/*
 * Get the number of online cpus visible to this zone. The system-wide
 * global 'ncpus_online' is returned if pools are disabled, the caller
 * is in the global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_online_get(zone_t *zone)
{
	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;

	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
}

/*
 * Return the pool to which the zone is currently bound.
 */
pool_t *
zone_pool_get(zone_t *zone)
{
	ASSERT(pool_lock_held());

	return (zone->zone_pool);
}

/*
 * Set the zone's pool pointer and update the zone's visibility to match
 * the resources in the new pool.
 */
void
zone_pool_set(zone_t *zone, pool_t *pool)
{
	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	zone->zone_pool = pool;
	zone_pset_set(zone, pool->pool_pset->pset_id);
}

/*
 * Return the cached value of the id of the processor set to which the
 * zone is currently bound. The value will be ZONE_PS_INVAL if the pools
 * facility is disabled.
 */
psetid_t
zone_pset_get(zone_t *zone)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (zone->zone_psetid);
}

/*
 * Set the cached value of the id of the processor set to which the zone
 * is currently bound. Also update the zone's visibility to match the
 * resources in the new processor set.
 */
void
zone_pset_set(zone_t *zone, psetid_t newpsetid)
{
	psetid_t oldpsetid;

	ASSERT(MUTEX_HELD(&cpu_lock));
	oldpsetid = zone_pset_get(zone);

	if (oldpsetid == newpsetid)
		return;
	/*
	 * Global zone sees all.
	 */
	if (zone != global_zone) {
		zone->zone_psetid = newpsetid;
		if (newpsetid != ZONE_PS_INVAL)
			pool_pset_visibility_add(newpsetid, zone);
		if (oldpsetid != ZONE_PS_INVAL)
			pool_pset_visibility_remove(oldpsetid, zone);
	}
	/*
	 * Disabling pools, so we should start using the global values
	 * for ncpus and ncpus_online.
	 */
	if (newpsetid == ZONE_PS_INVAL) {
		zone->zone_ncpus = 0;
		zone->zone_ncpus_online = 0;
	}
}

/*
 * Walk the list of active zones and issue the provided callback for
 * each of them.
 *
 * Caller must not be holding any locks that may be acquired under
 * zonehash_lock. See comment at the beginning of the file for a list of
 * common locks and their interactions with zones.
 */
int
zone_walk(int (*cb)(zone_t *, void *), void *data)
{
	zone_t *zone;
	int ret = 0;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		/*
		 * Skip zones that shouldn't be externally visible.
		 */
		status = zone_status_get(zone);
		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
			continue;
		/*
		 * Bail immediately if any callback invocation returns a
		 * non-zero value.
		 */
		ret = (*cb)(zone, data);
		if (ret != 0)
			break;
	}
	mutex_exit(&zonehash_lock);
	return (ret);
}

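/*
 * Illustrative sketch of a zone_walk() callback (hypothetical, kept out
 * of the build with #if 0): count the externally visible zones. A
 * non-zero return from the callback terminates the walk early, per the
 * comment above. The example_* name is invented for illustration.
 */
#if 0
static int
example_count_cb(zone_t *zone, void *arg)	/* hypothetical callback */
{
	uint_t *countp = arg;

	(*countp)++;
	return (0);	/* zero means: keep walking */
}

/* Caller: uint_t n = 0; (void) zone_walk(example_count_cb, &n); */
#endif
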
static int
zone_set_root(zone_t *zone, const char *upath)
{
	vnode_t *vp;
	int trycount;
	int error = 0;
	char *path;
	struct pathname upn, pn;
	size_t pathlen;

	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
		return (error);

	pn_alloc(&pn);

	/* prevent infinite loop */
	trycount = 10;
	for (;;) {
		if (--trycount <= 0) {
			error = ESTALE;
			goto out;
		}

		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
			/*
			 * VOP_ACCESS() may cover 'vp' with a new
			 * filesystem, if 'vp' is an autoFS vnode.
			 * Get the new 'vp' if so.
			 */
			if ((error = VOP_ACCESS(vp, VEXEC, 0, CRED())) == 0 &&
			    (vp->v_vfsmountedhere == NULL ||
			    (error = traverse(&vp)) == 0)) {
				pathlen = pn.pn_pathlen + 2;
				path = kmem_alloc(pathlen, KM_SLEEP);
				(void) strncpy(path, pn.pn_path,
				    pn.pn_pathlen + 1);
				path[pathlen - 2] = '/';
				path[pathlen - 1] = '\0';
				pn_free(&pn);
				pn_free(&upn);

				/* Success! */
				break;
			}
			VN_RELE(vp);
		}
		if (error != ESTALE)
			goto out;
	}

	ASSERT(error == 0);
	zone->zone_rootvp = vp;		/* we hold a reference to vp */
	zone->zone_rootpath = path;
	zone->zone_rootpathlen = pathlen;
	return (0);

out:
	pn_free(&pn);
	pn_free(&upn);
	return (error);
}

#define	isalnum(c)	(((c) >= '0' && (c) <= '9') || \
			((c) >= 'a' && (c) <= 'z') || \
			((c) >= 'A' && (c) <= 'Z'))

static int
zone_set_name(zone_t *zone, const char *uname)
{
	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
	size_t len;
	int i, err;

	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
		kmem_free(kname, ZONENAME_MAX);
		return (err);	/* EFAULT or ENAMETOOLONG */
	}

	/* must be less than ZONENAME_MAX */
	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
		kmem_free(kname, ZONENAME_MAX);
		return (EINVAL);
	}

	/*
	 * Name must start with an alphanumeric and must contain only
	 * alphanumerics, '-', '_' and '.'.
	 */
	if (!isalnum(kname[0])) {
		kmem_free(kname, ZONENAME_MAX);
		return (EINVAL);
	}
	for (i = 1; i < len - 1; i++) {
		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
		    kname[i] != '.') {
			kmem_free(kname, ZONENAME_MAX);
			return (EINVAL);
		}
	}

	zone->zone_name = kname;
	return (0);
}

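/*
 * The naming rule enforced above, restated as a self-contained predicate
 * (hypothetical mirror for illustration, kept out of the build with
 * #if 0): first character alphanumeric; remaining characters
 * alphanumeric, '-', '_' or '.'. The example_* name is invented.
 */
#if 0
static int
example_valid_zone_name(const char *name)	/* hypothetical */
{
	size_t i;

	if (name[0] == '\0' || !isalnum(name[0]))
		return (0);
	for (i = 1; name[i] != '\0'; i++) {
		if (!isalnum(name[i]) && name[i] != '-' &&
		    name[i] != '_' && name[i] != '.')
			return (0);
	}
	return (1);
}
#endif
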
/*
 * Similar to thread_create(), but makes sure the thread is in the appropriate
 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
 */
/*ARGSUSED*/
kthread_t *
zthread_create(
    caddr_t stk,
    size_t stksize,
    void (*proc)(),
    void *arg,
    size_t len,
    pri_t pri)
{
	kthread_t *t;
	zone_t *zone = curproc->p_zone;
	proc_t *pp = zone->zone_zsched;

	zone_hold(zone);	/* Reference to be dropped when thread exits */

	/*
	 * No-one should be trying to create threads if the zone is shutting
	 * down and there aren't any kernel threads around. See comment
	 * in zthread_exit().
	 */
	ASSERT(!(zone->zone_kthreads == NULL &&
	    zone_status_get(zone) >= ZONE_IS_EMPTY));
	/*
	 * Create a thread, but don't let it run until we've finished setting
	 * things up.
	 */
	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
	ASSERT(t->t_forw == NULL);
	mutex_enter(&zone_status_lock);
	if (zone->zone_kthreads == NULL) {
		t->t_forw = t->t_back = t;
	} else {
		kthread_t *tx = zone->zone_kthreads;

		t->t_forw = tx;
		t->t_back = tx->t_back;
		tx->t_back->t_forw = t;
		tx->t_back = t;
	}
	zone->zone_kthreads = t;
	mutex_exit(&zone_status_lock);

	mutex_enter(&pp->p_lock);
	t->t_proc_flag |= TP_ZTHREAD;
	project_rele(t->t_proj);
	t->t_proj = project_hold(pp->p_task->tk_proj);

	/*
	 * Setup complete, let it run.
	 */
	thread_lock(t);
	t->t_schedflag |= TS_ALLSTART;
	setrun_locked(t);
	thread_unlock(t);

	mutex_exit(&pp->p_lock);

	return (t);
}

/*
 * Similar to thread_exit(). Must be called by threads created via
 * zthread_create().
 */
void
zthread_exit(void)
{
	kthread_t *t = curthread;
	proc_t *pp = curproc;
	zone_t *zone = pp->p_zone;

	mutex_enter(&zone_status_lock);

	/*
	 * Reparent to p0
	 */
	mutex_enter(&pp->p_lock);
	t->t_proc_flag &= ~TP_ZTHREAD;
	t->t_procp = &p0;
	hat_thread_exit(t);
	mutex_exit(&pp->p_lock);

	if (t->t_back == t) {
		ASSERT(t->t_forw == t);
		/*
		 * If the zone is empty, once the thread count
		 * goes to zero no further kernel threads can be
		 * created. This is because if the creator is a process
		 * in the zone, then it must have exited before the zone
		 * state could be set to ZONE_IS_EMPTY.
		 * Otherwise, if the creator is a kernel thread in the
		 * zone, the thread count is non-zero.
		 *
		 * This really means that non-zone kernel threads should
		 * not create zone kernel threads.
		 */
		zone->zone_kthreads = NULL;
		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
			zone_status_set(zone, ZONE_IS_DOWN);
		}
	} else {
		t->t_forw->t_back = t->t_back;
		t->t_back->t_forw = t->t_forw;
		if (zone->zone_kthreads == t)
			zone->zone_kthreads = t->t_forw;
	}
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
	thread_exit();
	/* NOTREACHED */
}

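/*
 * Illustrative sketch of the zthread lifecycle (hypothetical zone kernel
 * thread, kept out of the build with #if 0): a thread created with
 * zthread_create() runs in the zone's zsched process and must terminate
 * via zthread_exit(), which drops the zone hold taken at creation.
 * The example_* name is invented for illustration.
 */
#if 0
static void
example_zthread(void *arg)	/* hypothetical worker */
{
	/* ... do per-zone work on behalf of curproc->p_zone ... */
	zthread_exit();
	/* NOTREACHED */
}

/* Creator, running in zone context: */
/* (void) zthread_create(NULL, 0, example_zthread, NULL, 0, minclsyspri); */
#endif
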
static void
zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
{
	vnode_t *oldvp;

	/* we're going to hold a reference here to the directory */
	VN_HOLD(vp);

#ifdef C2_AUDIT
	if (audit_active)	/* update abs cwd/root path see c2audit.c */
		audit_chdirec(vp, vpp);
#endif

	mutex_enter(&pp->p_lock);
	oldvp = *vpp;
	*vpp = vp;
	mutex_exit(&pp->p_lock);
	if (oldvp != NULL)
		VN_RELE(oldvp);
}

/*
 * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
 */
static int
nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
{
	nvpair_t *nvp = NULL;
	boolean_t priv_set = B_FALSE;
	boolean_t limit_set = B_FALSE;
	boolean_t action_set = B_FALSE;

	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		const char *name;
		uint64_t ui64;

		name = nvpair_name(nvp);
		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
			return (EINVAL);
		(void) nvpair_value_uint64(nvp, &ui64);
		if (strcmp(name, "privilege") == 0) {
			/*
			 * Currently only privileged values are allowed, but
			 * this may change in the future.
			 */
			if (ui64 != RCPRIV_PRIVILEGED)
				return (EINVAL);
			rv->rcv_privilege = ui64;
			priv_set = B_TRUE;
		} else if (strcmp(name, "limit") == 0) {
			rv->rcv_value = ui64;
			limit_set = B_TRUE;
		} else if (strcmp(name, "action") == 0) {
			if (ui64 != RCTL_LOCAL_NOACTION &&
			    ui64 != RCTL_LOCAL_DENY)
				return (EINVAL);
			rv->rcv_flagaction = ui64;
			action_set = B_TRUE;
		} else {
			return (EINVAL);
		}
	}

	if (!(priv_set && limit_set && action_set))
		return (EINVAL);
	rv->rcv_action_signal = 0;
	rv->rcv_action_recipient = NULL;
	rv->rcv_action_recip_pid = -1;
	rv->rcv_firing_time = 0;

	return (0);
}

void
zone_icode(void)
{
	proc_t *p = ttoproc(curthread);
	struct core_globals *cg;

	/*
	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
	 * storing just the pid of init is sufficient.
	 */
	p->p_zone->zone_proc_initpid = p->p_pid;

	/*
	 * Allocate user address space and stack segment
	 */

	p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
	p->p_usrstack = (caddr_t)USRSTACK32;
	p->p_model = DATAMODEL_ILP32;
	p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
	p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
	p->p_stk_ctl = INT32_MAX;

	p->p_as = as_alloc();
	p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
	(void) hat_setup(p->p_as->a_hat, HAT_INIT);

	cg = zone_getspecific(core_zone_key, p->p_zone);
	ASSERT(cg != NULL);
	corectl_path_hold(cg->core_default_path);
	corectl_content_hold(cg->core_default_content);
	p->p_corefile = cg->core_default_path;
	p->p_content = cg->core_default_content;

	init_mstate(curthread, LMS_SYSTEM);

	p->p_zone->zone_boot_err = exec_init(zone_initname, 0,
	    p->p_zone->zone_bootargs);

	mutex_enter(&zone_status_lock);
	if (p->p_zone->zone_boot_err != 0) {
		/*
		 * Make sure we are still in the booting state-- we could have
		 * raced and already be shutting down, or even further along.
		 */
		if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING)
			zone_status_set(p->p_zone, ZONE_IS_SHUTTING_DOWN);
		mutex_exit(&zone_status_lock);
		/* It's gone bad, dispose of the process */
		if (proc_exit(CLD_EXITED, p->p_zone->zone_boot_err) != 0) {
			mutex_enter(&p->p_lock);
			ASSERT(p->p_flag & SEXITLWPS);
			lwp_exit();
		}
	} else {
		if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING)
			zone_status_set(p->p_zone, ZONE_IS_RUNNING);
		mutex_exit(&zone_status_lock);
		/* cause the process to return to userland. */
		lwp_rtt();
	}
}

struct zsched_arg {
	zone_t *zone;
	nvlist_t *nvlist;
};

/*
 * Per-zone "sched" workalike. The similarity to "sched" doesn't have
 * anything to do with scheduling, but rather with the fact that
 * per-zone kernel threads are parented to zsched, just like regular
 * kernel threads are parented to sched (p0).
 *
 * zsched is also responsible for launching init for the zone.
 */
static void
zsched(void *arg)
{
	struct zsched_arg *za = arg;
	proc_t *pp = curproc;
	proc_t *initp = proc_init;
	zone_t *zone = za->zone;
	cred_t *cr, *oldcred;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	contract_t *ct = NULL;
	task_t *tk, *oldtk;
	rctl_entity_p_t e;
	kproject_t *pj;

	nvlist_t *nvl = za->nvlist;
	nvpair_t *nvp = NULL;

	bcopy("zsched", u.u_psargs, sizeof ("zsched"));
	bcopy("zsched", u.u_comm, sizeof ("zsched"));
	u.u_argc = 0;
	u.u_argv = NULL;
	u.u_envp = NULL;
	closeall(P_FINFO(pp));

	/*
	 * We are this zone's "zsched" process. As the zone isn't generally
	 * visible yet we don't need to grab any locks before initializing its
	 * zone_proc pointer.
	 */
	zone_hold(zone);	/* this hold is released by zone_destroy() */
	zone->zone_zsched = pp;
	mutex_enter(&pp->p_lock);
	pp->p_zone = zone;
	mutex_exit(&pp->p_lock);

	/*
	 * Disassociate process from its 'parent'; parent ourselves to init
	 * (pid 1) and change other values as needed.
	 */
	sess_create();

	mutex_enter(&pidlock);
	proc_detach(pp);
	pp->p_ppid = 1;
	pp->p_flag |= SZONETOP;
	pp->p_ancpid = 1;
	pp->p_parent = initp;
	pp->p_psibling = NULL;
	if (initp->p_child)
		initp->p_child->p_psibling = pp;
	pp->p_sibling = initp->p_child;
	initp->p_child = pp;

	/* Decrement what newproc() incremented. */
	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
	/*
	 * Our credentials are about to become kcred-like, so we don't care
	 * about the caller's ruid.
	 */
	upcount_inc(crgetruid(kcred), zone->zone_id);
	mutex_exit(&pidlock);

	/*
	 * getting out of global zone, so decrement lwp counts
	 */
	pj = pp->p_task->tk_proj;
	mutex_enter(&global_zone->zone_nlwps_lock);
	pj->kpj_nlwps -= pp->p_lwpcnt;
	global_zone->zone_nlwps -= pp->p_lwpcnt;
	mutex_exit(&global_zone->zone_nlwps_lock);

	/*
	 * Create and join a new task in project '0' of this zone.
	 *
	 * We don't need to call holdlwps() since we know we're the only lwp in
	 * this process.
	 *
	 * task_join() returns with p_lock held.
	 */
	tk = task_create(0, zone);
	mutex_enter(&cpu_lock);
	oldtk = task_join(tk, 0);
	mutex_exit(&curproc->p_lock);
	mutex_exit(&cpu_lock);
	task_rele(oldtk);

	/*
	 * add lwp counts to zsched's zone, and increment project's task count
	 * due to the task created in the above tasksys_settaskid
	 */
	pj = pp->p_task->tk_proj;
	mutex_enter(&zone->zone_nlwps_lock);
	pj->kpj_nlwps += pp->p_lwpcnt;
	pj->kpj_ntasks += 1;
	zone->zone_nlwps += pp->p_lwpcnt;
	mutex_exit(&zone->zone_nlwps_lock);

	/*
	 * The process was created by a process in the global zone, hence the
	 * credentials are wrong. We might as well have kcred-ish credentials.
	 */
	cr = zone->zone_kcred;
	crhold(cr);
	mutex_enter(&pp->p_crlock);
	oldcred = pp->p_cred;
	pp->p_cred = cr;
	mutex_exit(&pp->p_crlock);
	crfree(oldcred);

	/*
	 * Hold credentials again (for thread)
	 */
	crhold(cr);

	/*
	 * p_lwpcnt can't change since this is a kernel process.
	 */
	crset(pp, cr);

	/*
	 * Chroot
	 */
	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);

	/*
	 * Initialize zone's rctl set.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&pp->p_lock);
	e.rcep_p.zone = zone;
	e.rcep_t = RCENTITY_ZONE;
	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
	mutex_exit(&pp->p_lock);
	rctl_prealloc_destroy(gp);

	/*
	 * Apply the rctls passed in to zone_create(). This is basically a list
	 * assignment: all of the old values are removed and the new ones
	 * inserted. That is, if an empty list is passed in, all values are
	 * removed.
	 */
	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		rctl_dict_entry_t *rde;
		rctl_hndl_t hndl;
		char *name;
		nvlist_t **nvlarray;
		uint_t i, nelem;
		int error;	/* For ASSERT()s */

		name = nvpair_name(nvp);
		hndl = rctl_hndl_lookup(name);
		ASSERT(hndl != -1);
		rde = rctl_dict_lookup_hndl(hndl);
		ASSERT(rde != NULL);

		for (; /* ever */; ) {
			rctl_val_t oval;

			mutex_enter(&pp->p_lock);
			error = rctl_local_get(hndl, NULL, &oval, pp);
			mutex_exit(&pp->p_lock);
			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
			if (oval.rcv_privilege == RCPRIV_SYSTEM)
				break;
			mutex_enter(&pp->p_lock);
			error = rctl_local_delete(hndl, &oval, pp);
			mutex_exit(&pp->p_lock);
			ASSERT(error == 0);
		}
		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
		ASSERT(error == 0);
		for (i = 0; i < nelem; i++) {
			rctl_val_t *nvalp;

			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
			error = nvlist2rctlval(nvlarray[i], nvalp);
			ASSERT(error == 0);
			/*
			 * rctl_local_insert can fail if the value being
			 * inserted is a duplicate; this is OK.
			 */
			mutex_enter(&pp->p_lock);
			if (rctl_local_insert(hndl, nvalp, pp) != 0)
				kmem_cache_free(rctl_val_cache, nvalp);
			mutex_exit(&pp->p_lock);
		}
	}
	/*
	 * Tell the world that we're done setting up.
	 *
	 * At this point we want to set the zone status to ZONE_IS_READY
	 * and atomically set the zone's processor set visibility. Once
	 * we drop pool_lock() this zone will automatically get updated
	 * to reflect any future changes to the pools configuration.
	 */
	pool_lock();
	mutex_enter(&cpu_lock);
	mutex_enter(&zonehash_lock);
	zone_uniqid(zone);
	zone_zsd_configure(zone);
	if (pool_state == POOL_ENABLED)
		zone_pset_set(zone, pool_default->pool_pset->pset_id);
	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
	zone_status_set(zone, ZONE_IS_READY);
	mutex_exit(&zone_status_lock);
	mutex_exit(&zonehash_lock);
	mutex_exit(&cpu_lock);
	pool_unlock();

	/*
	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
	 * we launch init, and set the state to running.
	 */
	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");

	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
		id_t cid;

		/*
		 * Ok, this is a little complicated. We need to grab the
		 * zone's pool's scheduling class ID; note that by now, we
		 * are already bound to a pool if we need to be (zoneadmd
		 * will have done that to us while we're in the READY
		 * state). *But* the scheduling class for the zone's 'init'
		 * must be explicitly passed to newproc, which doesn't
		 * respect pool bindings.
		 *
		 * We hold the pool_lock across the call to newproc() to
		 * close the obvious race: the pool's scheduling class
		 * could change before we manage to create the LWP with
		 * classid 'cid'.
		 */
		pool_lock();
		cid = pool_get_class(zone->zone_pool);
		if (cid == -1)
			cid = defaultcid;

		/*
		 * If this fails, zone_boot will ultimately fail. The
		 * state of the zone will be set to SHUTTING_DOWN-- userland
		 * will have to tear down the zone, and fail, or try again.
		 */
		if ((zone->zone_boot_err = newproc(zone_icode, NULL, cid,
		    minclsyspri - 1, &ct)) != 0) {
			mutex_enter(&zone_status_lock);
			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
			mutex_exit(&zone_status_lock);
		}
		pool_unlock();
	}

	/*
	 * Wait for zone_destroy() to be called. This is what we spend
	 * most of our life doing.
	 */
	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");

	if (ct)
		/*
		 * At this point the process contract should be empty.
		 * (Though if it isn't, it's not the end of the world.)
		 */
		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);

	/*
	 * Allow kcred to be freed when all referring processes
	 * (including this one) go away. We can't just do this in
	 * zone_free because we need to wait for the zone_cred_ref to
	 * drop to 0 before calling zone_free, and the existence of
	 * zone_kcred will prevent that. Thus, we call crfree here to
	 * balance the crdup in zone_create. The crhold calls earlier
	 * in zsched will be dropped when the thread and process exit.
	 */
	crfree(zone->zone_kcred);
	zone->zone_kcred = NULL;

	exit(CLD_EXITED, 0);
}

/*
 * Helper function to determine if there are any submounts of the
 * provided path. Used to make sure the zone doesn't "inherit" any
 * mounts from before it is created.
 */
static uint_t
zone_mount_count(const char *rootpath)
{
	vfs_t *vfsp;
	uint_t count = 0;
	size_t rootpathlen = strlen(rootpath);

	/*
	 * Holding zonehash_lock prevents race conditions with
	 * vfs_list_add()/vfs_list_remove() since we serialize with
	 * zone_find_by_path().
	 */
	ASSERT(MUTEX_HELD(&zonehash_lock));
	/*
	 * The rootpath must end with a '/'
	 */
	ASSERT(rootpath[rootpathlen - 1] == '/');

	/*
	 * This intentionally does not count the rootpath itself if that
	 * happens to be a mount point.
	 */
	vfs_list_read_lock();
	vfsp = rootvfs;
	do {
		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
		    rootpathlen) == 0)
			count++;
		vfsp = vfsp->vfs_next;
	} while (vfsp != rootvfs);
	vfs_list_unlock();
	return (count);
}

/*
 * Helper function to make sure that a zone created on 'rootpath'
 * wouldn't end up containing other zones' rootpaths.
 */
static boolean_t
zone_is_nested(const char *rootpath)
{
	zone_t *zone;
	size_t rootpathlen = strlen(rootpath);
	size_t len;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		if (zone == global_zone)
			continue;
		len = strlen(zone->zone_rootpath);
		if (strncmp(rootpath, zone->zone_rootpath,
		    MIN(rootpathlen, len)) == 0)
			return (B_TRUE);
	}
	return (B_FALSE);
}

2399 |
|
|
2400 |
static int
|
|
2401 |
zone_set_privset(zone_t *zone, const priv_set_t *zone_privs)
|
|
2402 |
{
|
|
2403 |
priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
|
|
2404 |
|
|
2405 |
if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
|
|
2406 |
kmem_free(privs, sizeof (priv_set_t));
|
|
2407 |
return (EFAULT);
|
|
2408 |
}
|
|
2409 |
|
|
2410 |
zone->zone_privset = privs;
|
|
2411 |
return (0);
|
|
2412 |
}
|
|
2413 |
|
|
2414 |
/*
 * We make creative use of nvlists to pass in rctls from userland. The list is
 * a list of the following structures:
 *
 * (name = rctl_name, value = nvpair_list_array)
 *
 * Where each element of the nvpair_list_array is of the form:
 *
 * [(name = "privilege", value = RCPRIV_PRIVILEGED),
 *  (name = "limit", value = uint64_t),
 *  (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
 */
static int
parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
{
	nvpair_t *nvp = NULL;
	nvlist_t *nvl = NULL;
	char *kbuf;
	int error;
	rctl_val_t rv;

	*nvlp = NULL;

	if (buflen == 0)
		return (0);

	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
		return (ENOMEM);
	if (copyin(ubuf, kbuf, buflen)) {
		error = EFAULT;
		goto out;
	}
	if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
		/*
		 * nvl may have been allocated/free'd, but the value set to
		 * non-NULL, so we reset it here.
		 */
		nvl = NULL;
		error = EINVAL;
		goto out;
	}
	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		rctl_dict_entry_t *rde;
		rctl_hndl_t hndl;
		nvlist_t **nvlarray;
		uint_t i, nelem;
		char *name;

		error = EINVAL;
		name = nvpair_name(nvp);
		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
			goto out;
		}
		if ((hndl = rctl_hndl_lookup(name)) == -1) {
			goto out;
		}
		rde = rctl_dict_lookup_hndl(hndl);
		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
		ASSERT(error == 0);
		for (i = 0; i < nelem; i++) {
			if (error = nvlist2rctlval(nvlarray[i], &rv))
				goto out;
		}
		if (rctl_invalid_value(rde, &rv)) {
			error = EINVAL;
			goto out;
		}
	}
	error = 0;
	*nvlp = nvl;
out:
	kmem_free(kbuf, buflen);
	if (error && nvl != NULL)
		nvlist_free(nvl);
	return (error);
}

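/*
 * Illustrative sketch of how userland might construct the packed nvlist
 * described above (hypothetical, using libnvpair; error handling elided;
 * kept out of the build with #if 0): one uint64 triple per rctl value,
 * gathered into an nvpair_list_array keyed by the rctl name. The
 * example_* name and the "zone.cpu-shares" / limit values are chosen
 * only for illustration.
 */
#if 0
static int
example_pack_rctls(char **bufp, size_t *lenp)	/* hypothetical */
{
	nvlist_t *nvl, *val;

	(void) nvlist_alloc(&val, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_uint64(val, "privilege", RCPRIV_PRIVILEGED);
	(void) nvlist_add_uint64(val, "limit", 10);
	(void) nvlist_add_uint64(val, "action", RCTL_LOCAL_NOACTION);

	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_nvlist_array(nvl, "zone.cpu-shares", &val, 1);

	*bufp = NULL;
	*lenp = 0;
	return (nvlist_pack(nvl, bufp, lenp, NV_ENCODE_NATIVE, 0));
}
#endif
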
int
zone_create_error(int er_error, int er_ext, int *er_out)
{
	if (er_out != NULL) {
		if (copyout(&er_ext, er_out, sizeof (int))) {
			return (set_errno(EFAULT));
		}
	}
	return (set_errno(er_error));
}

/*
 * System call to create/initialize a new zone named 'zone_name', rooted
 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
 * and initialized with the zone-wide rctls described in 'rctlbuf'.
 *
 * If extended error is non-null, we may use it to return more detailed
 * error information.
 */
static zoneid_t
zone_create(const char *zone_name, const char *zone_root,
    const priv_set_t *zone_privs, caddr_t rctlbuf, size_t rctlbufsz,
    int *extended_error)
{
	struct zsched_arg zarg;
	nvlist_t *rctls = NULL;
	proc_t *pp = curproc;
	zone_t *zone, *ztmp;
	zoneid_t zoneid;
	int error;
	int error2 = 0;
	char *str;
	cred_t *zkcr;

	if (secpolicy_zone_config(CRED()) != 0)
		return (set_errno(EPERM));

	/* can't boot zone from within chroot environment */
	if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
		return (zone_create_error(ENOTSUP, ZE_CHROOTED,
		    extended_error));

	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
	zoneid = zone->zone_id = id_alloc(zoneid_space);
	zone->zone_status = ZONE_IS_UNINITIALIZED;
	zone->zone_pool = pool_default;
	zone->zone_pool_mod = gethrtime();
	zone->zone_psetid = ZONE_PS_INVAL;
	zone->zone_ncpus = 0;
	zone->zone_ncpus_online = 0;
	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));

	if ((error = zone_set_name(zone, zone_name)) != 0) {
		zone_free(zone);
		return (zone_create_error(error, 0, extended_error));
	}

	if ((error = zone_set_root(zone, zone_root)) != 0) {
		zone_free(zone);
		return (zone_create_error(error, 0, extended_error));
	}
	if ((error = zone_set_privset(zone, zone_privs)) != 0) {
		zone_free(zone);
		return (zone_create_error(error, 0, extended_error));
	}

	/* initialize node name to be the same as zone name */
	zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
	(void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
	zone->zone_nodename[_SYS_NMLN - 1] = '\0';

	zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
	zone->zone_domain[0] = '\0';
	zone->zone_shares = 1;
	zone->zone_bootargs = NULL;

	/*
	 * Zsched initializes the rctls.
	 */
	zone->zone_rctls = NULL;

	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
		zone_free(zone);
		return (zone_create_error(error, 0, extended_error));
	}

	/*
	 * Stop all lwps since that's what normally happens as part of fork().
	 * This needs to happen before we grab any locks to avoid deadlock
	 * (another lwp in the process could be waiting for the held lock).
	 */
	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
		zone_free(zone);
		if (rctls)
			nvlist_free(rctls);
		/* 'error' is still 0 here; report the interruption */
		return (zone_create_error(EINTR, 0, extended_error));
	}

	if (block_mounts() == 0) {
		mutex_enter(&pp->p_lock);
		if (curthread != pp->p_agenttp)
			continuelwps(pp);
		mutex_exit(&pp->p_lock);
		zone_free(zone);
		if (rctls)
			nvlist_free(rctls);
		/* 'error' is still 0 here; report the interruption */
		return (zone_create_error(EINTR, 0, extended_error));
	}

	/*
	 * Set up credential for kernel access. After this, any errors
	 * should go through the dance in errout rather than calling
	 * zone_free directly.
	 */
	zone->zone_kcred = crdup(kcred);
	crsetzone(zone->zone_kcred, zone);
	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));

	mutex_enter(&zonehash_lock);
	/*
	 * Make sure zone doesn't already exist.
	 */
	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL) {
		zone_status_t status;

		status = zone_status_get(ztmp);
		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
			error = EEXIST;
		else
			error = EBUSY;
		goto errout;
	}

	/*
	 * Don't allow zone creations which would cause one zone's rootpath to
	 * be accessible from that of another (non-global) zone.
	 */
	if (zone_is_nested(zone->zone_rootpath)) {
		error = EBUSY;
		goto errout;
	}

	ASSERT(zonecount != 0);		/* check for leaks */
	if (zonecount + 1 > maxzones) {
		error = ENOMEM;
		goto errout;
	}

	if (zone_mount_count(zone->zone_rootpath) != 0) {
		error = EBUSY;
		error2 = ZE_AREMOUNTS;
		goto errout;
	}

	/*
	 * Zone is still incomplete, but we need to drop all locks while
	 * zsched() initializes this zone's kernel process. We
	 * optimistically add the zone to the hashtable and associated
	 * lists so a parallel zone_create() doesn't try to create the
	 * same zone.
	 */
	zonecount++;
	(void) mod_hash_insert(zonehashbyid,
	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
	    (mod_hash_val_t)(uintptr_t)zone);
	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
	(void) strcpy(str, zone->zone_name);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
	    (mod_hash_val_t)(uintptr_t)zone);
	/*
	 * Insert into active list. At this point there are no 'hold's
	 * on the zone, but everyone else knows not to use it, so we can
	 * continue to use it. zsched() will do a zone_hold() if the
	 * newproc() is successful.
	 */
	list_insert_tail(&zone_active, zone);
	mutex_exit(&zonehash_lock);

	zarg.zone = zone;
	zarg.nvlist = rctls;
	/*
	 * The process, task, and project rctls are probably wrong;
	 * we need an interface to get the default values of all rctls,
	 * and initialize zsched appropriately. I'm not sure that that
	 * makes much of a difference, though.
	 */
	if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) {
		/*
		 * We need to undo all globally visible state.
		 */
		mutex_enter(&zonehash_lock);
		list_remove(&zone_active, zone);
		(void) mod_hash_destroy(zonehashbyname,
		    (mod_hash_key_t)(uintptr_t)zone->zone_name);
		(void) mod_hash_destroy(zonehashbyid,
		    (mod_hash_key_t)(uintptr_t)zone->zone_id);
		ASSERT(zonecount > 1);
		zonecount--;
		goto errout;
	}

	/*
	 * Zone creation can't fail from now on.
	 */

	/*
	 * Let the other lwps continue.
	 */
	mutex_enter(&pp->p_lock);
	if (curthread != pp->p_agenttp)
		continuelwps(pp);
	mutex_exit(&pp->p_lock);

	/*
	 * Wait for zsched to finish initializing the zone.
	 */
	zone_status_wait(zone, ZONE_IS_READY);
	/*
	 * The zone is fully visible, so we can let mounts progress.
	 */
	resume_mounts();
	if (rctls)
		nvlist_free(rctls);

	return (zoneid);

errout:
	mutex_exit(&zonehash_lock);
	/*
	 * Let the other lwps continue.
	 */
	mutex_enter(&pp->p_lock);
	if (curthread != pp->p_agenttp)
		continuelwps(pp);
	mutex_exit(&pp->p_lock);

	resume_mounts();
	if (rctls)
		nvlist_free(rctls);
	/*
	 * There is currently one reference to the zone, a cred_ref from
	 * zone_kcred. To free the zone, we call crfree, which will call
	 * zone_cred_rele, which will call zone_free.
	 */
	ASSERT(zone->zone_cred_ref == 1);	/* for zone_kcred */
	ASSERT(zone->zone_kcred->cr_ref == 1);
	ASSERT(zone->zone_ref == 0);
	zkcr = zone->zone_kcred;
	zone->zone_kcred = NULL;
	crfree(zkcr);	/* triggers call to zone_free */
	return (zone_create_error(error, error2, extended_error));
}

/*
 * Cause the zone to boot. This is pretty simple, since we let zoneadmd do
 * the heavy lifting.
 */
static int
zone_boot(zoneid_t zoneid, const char *bootargs)
{
	int err;
	zone_t *zone;

	if (secpolicy_zone_config(CRED()) != 0)
		return (set_errno(EPERM));
	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
		return (set_errno(EINVAL));

	mutex_enter(&zonehash_lock);
	/*
	 * Look for zone under hash lock to prevent races with calls to
	 * zone_shutdown, zone_destroy, etc.
	 */
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (set_errno(EINVAL));
	}

	if ((err = zone_set_bootargs(zone, bootargs)) != 0) {
		mutex_exit(&zonehash_lock);
		return (set_errno(err));
	}

	mutex_enter(&zone_status_lock);
	if (zone_status_get(zone) != ZONE_IS_READY) {
		mutex_exit(&zone_status_lock);
		mutex_exit(&zonehash_lock);
		return (set_errno(EINVAL));
	}
	zone_status_set(zone, ZONE_IS_BOOTING);
	mutex_exit(&zone_status_lock);

	zone_hold(zone);	/* so we can use the zone_t later */
	mutex_exit(&zonehash_lock);

	if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
		zone_rele(zone);
		return (set_errno(EINTR));
	}

	/*
	 * Boot (starting init) might have failed, in which case the zone
	 * will go to the SHUTTING_DOWN state; an appropriate errno will
	 * be placed in zone->zone_boot_err, and so we return that.
	 */
	err = zone->zone_boot_err;
	zone_rele(zone);
	return (err ? set_errno(err) : 0);
}

/*
 * Kills all user processes in the zone, waiting for them all to exit
 * before returning.
 */
static int
zone_empty(zone_t *zone)
{
	int waitstatus;

	/*
	 * We need to drop zonehash_lock before killing all
	 * processes, otherwise we'll deadlock with zone_find_*
	 * which can be called from the exit path.
	 */
	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
	while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz,
	    ZONE_IS_EMPTY)) == -1) {
		killall(zone->zone_id);
	}
	/*
	 * return EINTR if we were signaled
	 */
	if (waitstatus == 0)
		return (EINTR);
	return (0);
}

/*
 * System call to start the zone's halt sequence. By the time this
 * function successfully returns, all user processes and kernel threads
 * executing in it will have exited, ZSD shutdown callbacks executed,
 * and the zone status set to ZONE_IS_DOWN.
 *
 * It is possible that the call will interrupt itself if the caller is the
 * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
 */
static int
zone_shutdown(zoneid_t zoneid)
{
	int error;
	zone_t *zone;
	zone_status_t status;

	if (secpolicy_zone_config(CRED()) != 0)
		return (set_errno(EPERM));
	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
		return (set_errno(EINVAL));

	/*
	 * Block mounts so that VFS_MOUNT() can get an accurate view of
	 * the zone's status with regards to ZONE_IS_SHUTTING_DOWN.
	 *
	 * e.g. NFS can fail the mount if it determines that the zone
	 * has already begun the shutdown sequence.
	 */
	if (block_mounts() == 0)
		return (set_errno(EINTR));
	mutex_enter(&zonehash_lock);
	/*
	 * Look for zone under hash lock to prevent races with other
	 * calls to zone_shutdown and zone_destroy.
	 */
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		resume_mounts();
		return (set_errno(EINVAL));
	}
	mutex_enter(&zone_status_lock);
	status = zone_status_get(zone);
	/*
	 * Fail if the zone isn't fully initialized yet.
	 */
	if (status < ZONE_IS_READY) {
		mutex_exit(&zone_status_lock);
		mutex_exit(&zonehash_lock);
		resume_mounts();
		return (set_errno(EINVAL));
	}
	/*
	 * If conditions required for zone_shutdown() to return have been met,
	 * return success.
	 */
	if (status >= ZONE_IS_DOWN) {
		mutex_exit(&zone_status_lock);
		mutex_exit(&zonehash_lock);
		resume_mounts();
		return (0);
	}
	/*
	 * If zone_shutdown() hasn't been called before, go through the motions.
	 * If it has, there's nothing to do but wait for the kernel threads to
	 * drain.
	 */
	if (status < ZONE_IS_EMPTY) {
		uint_t ntasks;

		mutex_enter(&zone->zone_lock);
		if ((ntasks = zone->zone_ntasks) != 1) {
			/*
			 * There's still stuff running.
			 */
			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
		}
		mutex_exit(&zone->zone_lock);
		if (ntasks == 1) {
			/*
			 * The only way to create another task is through
			 * zone_enter(), which will block until we drop
			 * zonehash_lock. The zone is empty.
			 */
			if (zone->zone_kthreads == NULL) {
				/*
				 * Skip ahead to ZONE_IS_DOWN
				 */
				zone_status_set(zone, ZONE_IS_DOWN);
			} else {
				zone_status_set(zone, ZONE_IS_EMPTY);
			}
		}
	}
	zone_hold(zone);	/* so we can use the zone_t later */
	mutex_exit(&zone_status_lock);
	mutex_exit(&zonehash_lock);
	resume_mounts();

	if (error = zone_empty(zone)) {
		zone_rele(zone);
		return (set_errno(error));
	}
	/*
	 * After the zone status goes to ZONE_IS_DOWN this zone will no
	 * longer be notified of changes to the pools configuration, so
	 * in order to not end up with a stale pool pointer, we point
	 * ourselves at the default pool and remove all resource
	 * visibility. This is especially important as the zone_t may
	 * languish on the deathrow for a very long time waiting for
	 * cred's to drain out.
	 *
	 * This rebinding of the zone can happen multiple times
	 * (presumably due to interrupted or parallel systemcalls)
	 * without any adverse effects.
	 */
	if (pool_lock_intr() != 0) {
		zone_rele(zone);
		return (set_errno(EINTR));
	}
	if (pool_state == POOL_ENABLED) {
		mutex_enter(&cpu_lock);
		zone_pool_set(zone, pool_default);
		/*
		 * The zone no longer needs to be able to see any cpus.
		 */
		zone_pset_set(zone, ZONE_PS_INVAL);
		mutex_exit(&cpu_lock);
	}
	pool_unlock();

	/*
	 * ZSD shutdown callbacks can be executed multiple times, hence
	 * it is safe to not be holding any locks across this call.
	 */
	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);

	mutex_enter(&zone_status_lock);
	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
		zone_status_set(zone, ZONE_IS_DOWN);
	mutex_exit(&zone_status_lock);

	/*
	 * Wait for kernel threads to drain.
	 */
	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
		zone_rele(zone);
		return (set_errno(EINTR));
	}
	zone_rele(zone);
	return (0);
}

/*
 * System call entry point to finalize the zone halt process. The caller
 * must have already successfully called zone_shutdown().
 *
 * Upon successful completion, the zone will have been fully destroyed:
 * zsched will have exited, destructor callbacks executed, and the zone
 * removed from the list of active zones.
 */
static int
zone_destroy(zoneid_t zoneid)
{
	uint64_t uniqid;
	zone_t *zone;
	zone_status_t status;

	if (secpolicy_zone_config(CRED()) != 0)
		return (set_errno(EPERM));
	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
		return (set_errno(EINVAL));

	mutex_enter(&zonehash_lock);
	/*
	 * Look for zone under hash lock to prevent races with other
	 * calls to zone_destroy.
	 */
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (set_errno(EINVAL));
	}

	if (zone_mount_count(zone->zone_rootpath) != 0) {
		mutex_exit(&zonehash_lock);
		return (set_errno(EBUSY));
	}
	mutex_enter(&zone_status_lock);
	status = zone_status_get(zone);
	if (status < ZONE_IS_DOWN) {
		mutex_exit(&zone_status_lock);
		mutex_exit(&zonehash_lock);
		return (set_errno(EBUSY));
	} else if (status == ZONE_IS_DOWN) {
		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
	}
	mutex_exit(&zone_status_lock);
	zone_hold(zone);
	mutex_exit(&zonehash_lock);

	/*
	 * wait for zsched to exit
	 */
	zone_status_wait(zone, ZONE_IS_DEAD);
	zone_zsd_callbacks(zone, ZSD_DESTROY);
	uniqid = zone->zone_uniqid;
	zone_rele(zone);
	zone = NULL;	/* potentially free'd */

	mutex_enter(&zonehash_lock);
	for (; /* ever */; ) {
		boolean_t unref;

		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
		    zone->zone_uniqid != uniqid) {
			/*
			 * The zone has gone away. Necessary conditions
			 * are met, so we return success.
			 */
			mutex_exit(&zonehash_lock);
			return (0);
		}
		mutex_enter(&zone->zone_lock);
		unref = ZONE_IS_UNREF(zone);
		mutex_exit(&zone->zone_lock);
		if (unref) {
			/*
			 * There is only one reference to the zone -- that
			 * added when the zone was added to the hashtables --
			 * and things will remain this way until we drop
			 * zonehash_lock... we can go ahead and cleanup the
			 * zone.
			 */
			break;
		}

		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
			/* Signaled */
			mutex_exit(&zonehash_lock);
			return (set_errno(EINTR));
		}
	}

	/*
	 * It is now safe to let the zone be recreated; remove it from the
	 * lists. The memory will not be freed until the last cred
	 * reference goes away.
	 */
	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
	zonecount--;
	/* remove from active list and hash tables */
	list_remove(&zone_active, zone);
	(void) mod_hash_destroy(zonehashbyname,
	    (mod_hash_key_t)zone->zone_name);
	(void) mod_hash_destroy(zonehashbyid,
	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
	mutex_exit(&zonehash_lock);

	/*
	 * Release the root vnode; we're not using it anymore. Nor should any
	 * other thread that might access it exist.
	 */
	if (zone->zone_rootvp != NULL) {
		VN_RELE(zone->zone_rootvp);
		zone->zone_rootvp = NULL;
	}

	/* add to deathrow list */
	mutex_enter(&zone_deathrow_lock);
	list_insert_tail(&zone_deathrow, zone);
	mutex_exit(&zone_deathrow_lock);

	/*
	 * Drop last reference (which was added by zsched()), this will
	 * free the zone unless there are outstanding cred references.
	 */
	zone_rele(zone);
	return (0);
}

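/*
 * Illustrative sketch of the required halt ordering from a caller's
 * point of view (hypothetical, kept out of the build with #if 0):
 * zone_shutdown() must succeed, leaving the zone at least ZONE_IS_DOWN,
 * before zone_destroy() can be attempted; otherwise zone_destroy()
 * fails with EBUSY. The example_* name is invented for illustration.
 */
#if 0
static int
example_halt_zone(zoneid_t zoneid)	/* hypothetical */
{
	int err;

	if ((err = zone_shutdown(zoneid)) != 0)
		return (err);	/* zone not down; destroy would fail */
	return (zone_destroy(zoneid));
}
#endif
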
/*
|
|
3116 |
* Systemcall entry point for zone_getattr(2).
|
|
3117 |
*/
|
|
3118 |
static ssize_t
zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
{
	size_t size;
	int error = 0, err;
	zone_t *zone;
	char *zonepath;
	zone_status_t zone_status;
	pid_t initpid;
	boolean_t global = (curproc->p_zone == global_zone);

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (set_errno(EINVAL));
	}
	zone_status = zone_status_get(zone);
	if (zone_status < ZONE_IS_READY) {
		mutex_exit(&zonehash_lock);
		return (set_errno(EINVAL));
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);

	/*
	 * If not in the global zone, don't show information about other zones.
	 */
	if (!global && curproc->p_zone != zone) {
		zone_rele(zone);
		return (set_errno(EINVAL));
	}

	switch (attr) {
	case ZONE_ATTR_ROOT:
		if (global) {
			/*
			 * Copy the path to trim the trailing "/" (except for
			 * the global zone).
			 */
			if (zone != global_zone)
				size = zone->zone_rootpathlen - 1;
			else
				size = zone->zone_rootpathlen;
			zonepath = kmem_alloc(size, KM_SLEEP);
			bcopy(zone->zone_rootpath, zonepath, size);
			zonepath[size - 1] = '\0';
		} else {
			/*
			 * Caller is not in the global zone; just return a
			 * faked-up path for the current zone.
			 */
			zonepath = "/";
			size = 2;
		}
		if (bufsize > size)
			bufsize = size;
		if (buf != NULL) {
			err = copyoutstr(zonepath, buf, bufsize, NULL);
			if (err != 0 && err != ENAMETOOLONG)
				error = EFAULT;
		}
		if (global)
			kmem_free(zonepath, size);
		break;

	case ZONE_ATTR_NAME:
		size = strlen(zone->zone_name) + 1;
		if (bufsize > size)
			bufsize = size;
		if (buf != NULL) {
			err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
			if (err != 0 && err != ENAMETOOLONG)
				error = EFAULT;
		}
		break;

	case ZONE_ATTR_STATUS:
		/*
		 * Since we're not holding zonehash_lock, the zone status
		 * may be anything; leave it up to userland to sort it out.
		 */
		size = sizeof (zone_status);
		if (bufsize > size)
			bufsize = size;
		zone_status = zone_status_get(zone);
		if (buf != NULL &&
		    copyout(&zone_status, buf, bufsize) != 0)
			error = EFAULT;
		break;
	case ZONE_ATTR_PRIVSET:
		size = sizeof (priv_set_t);
		if (bufsize > size)
			bufsize = size;
		if (buf != NULL &&
		    copyout(zone->zone_privset, buf, bufsize) != 0)
			error = EFAULT;
		break;
	case ZONE_ATTR_UNIQID:
		size = sizeof (zone->zone_uniqid);
		if (bufsize > size)
			bufsize = size;
		if (buf != NULL &&
		    copyout(&zone->zone_uniqid, buf, bufsize) != 0)
			error = EFAULT;
		break;
	case ZONE_ATTR_POOLID:
		{
			pool_t *pool;
			poolid_t poolid;

			if (pool_lock_intr() != 0) {
				error = EINTR;
				break;
			}
			pool = zone_pool_get(zone);
			poolid = pool->pool_id;
			pool_unlock();
			size = sizeof (poolid);
			if (bufsize > size)
				bufsize = size;
			if (buf != NULL &&
			    copyout(&poolid, buf, bufsize) != 0)
				error = EFAULT;
		}
		break;
	case ZONE_ATTR_INITPID:
		size = sizeof (initpid);
		if (bufsize > size)
			bufsize = size;
		initpid = zone->zone_proc_initpid;
		if (initpid == -1) {
			error = ESRCH;
			break;
		}
		if (buf != NULL &&
		    copyout(&initpid, buf, bufsize) != 0)
			error = EFAULT;
		break;
	default:
		error = EINVAL;
	}
	zone_rele(zone);

	if (error)
		return (set_errno(error));
	return ((ssize_t)size);
}

/*
 * Return zero if the process has at least one vnode mapped into its
 * address space which shouldn't be allowed to change zones.
 */
static int
as_can_change_zones(void)
{
	proc_t *pp = curproc;
	struct seg *seg;
	struct as *as = pp->p_as;
	vnode_t *vp;
	int allow = 1;

	ASSERT(pp->p_as != &kas);
	AS_LOCK_ENTER(&as, &as->a_lock, RW_READER);
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		/*
		 * If we can't get a backing vnode for this segment,
		 * skip it.
		 */
		vp = NULL;
		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
			continue;
		if (!vn_can_change_zones(vp)) { /* bail on first match */
			allow = 0;
			break;
		}
	}
	AS_LOCK_EXIT(&as, &as->a_lock);
	return (allow);
}

/*
 * Systemcall entry point for zone_enter().
 *
 * The current process is injected into said zone.  In the process
 * it will change its project membership, privileges, rootdir/cwd,
 * zone-wide rctls, and pool association to match those of the zone.
 *
 * The first zone_enter() called while the zone is in the ZONE_IS_READY
 * state will transition it to ZONE_IS_RUNNING.  Processes may only
 * enter a zone that is "ready" or "running".
 */
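
/*
 * An illustrative sketch, not part of this file: zone_enter() is one-way
 * and affects the entire process, so a zlogin-like consumer would
 * typically fork, enter the zone in the child, and exec there.  Contract
 * setup is omitted here; as enforced below, the child must also be the
 * sole member of its process contract.  Assuming the libc wrapper
 * declared in <zone.h>:
 *
 *	pid_t pid = fork();
 *
 *	if (pid == 0) {
 *		if (zone_enter(zoneid) != 0)
 *			_exit(1);
 *		(void) execl("/sbin/sh", "sh", (char *)0);
 *		_exit(1);
 *	}
 */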
static int
zone_enter(zoneid_t zoneid)
{
	zone_t *zone;
	vnode_t *vp;
	proc_t *pp = curproc;
	contract_t *ct;
	cont_process_t *ctp;
	task_t *tk, *oldtk;
	kproject_t *zone_proj0;
	cred_t *cr, *newcr;
	pool_t *oldpool, *newpool;
	sess_t *sp;
	uid_t uid;
	zone_status_t status;
	int err = 0;
	rctl_entity_p_t e;

	if (secpolicy_zone_config(CRED()) != 0)
		return (set_errno(EPERM));
	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
		return (set_errno(EINVAL));

	/*
	 * Stop all lwps so we don't need to hold a lock to look at
	 * curproc->p_zone.  This needs to happen before we grab any
	 * locks to avoid deadlock (another lwp in the process could
	 * be waiting for the held lock).
	 */
	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
		return (set_errno(EINTR));

	/*
	 * Make sure we're not changing zones with files open or mapped
	 * into our address space which shouldn't be changing zones.
	 */
	if (!files_can_change_zones()) {
		err = EBADF;
		goto out;
	}
	if (!as_can_change_zones()) {
		err = EFAULT;
		goto out;
	}

	mutex_enter(&zonehash_lock);
	if (pp->p_zone != global_zone) {
		mutex_exit(&zonehash_lock);
		err = EINVAL;
		goto out;
	}

	zone = zone_find_all_by_id(zoneid);
	if (zone == NULL) {
		mutex_exit(&zonehash_lock);
		err = EINVAL;
		goto out;
	}

	/*
	 * To prevent processes in a zone from holding contracts on
	 * extrazonal resources, and to avoid process contract
	 * memberships which span zones, contract holders and processes
	 * which aren't the sole members of their encapsulating process
	 * contracts are not allowed to zone_enter.
	 */
	ctp = pp->p_ct_process;
	ct = &ctp->conp_contract;
	mutex_enter(&ct->ct_lock);
	mutex_enter(&pp->p_lock);
	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
		mutex_exit(&pp->p_lock);
		mutex_exit(&ct->ct_lock);
		mutex_exit(&zonehash_lock);
		err = EINVAL;
		goto out;
	}

	/*
	 * Moreover, we don't allow processes whose encapsulating
	 * process contracts have inherited extrazonal contracts.
	 * While it would be easier to eliminate all process contracts
	 * with inherited contracts, we need to be able to give a
	 * restarted init (or other zone-penetrating process) its
	 * predecessor's contracts.
	 */
	if (ctp->conp_ninherited != 0) {
		contract_t *next;
		for (next = list_head(&ctp->conp_inherited); next;
		    next = list_next(&ctp->conp_inherited, next)) {
			if (contract_getzuniqid(next) != zone->zone_uniqid) {
				mutex_exit(&pp->p_lock);
				mutex_exit(&ct->ct_lock);
				mutex_exit(&zonehash_lock);
				err = EINVAL;
				goto out;
			}
		}
	}
	mutex_exit(&pp->p_lock);
	mutex_exit(&ct->ct_lock);

	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
		/*
		 * Can't join
		 */
		mutex_exit(&zonehash_lock);
		err = EINVAL;
		goto out;
	}

	/*
	 * Make sure new priv set is within the permitted set for caller
	 */
	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
		mutex_exit(&zonehash_lock);
		err = EPERM;
		goto out;
	}
	/*
	 * We want to momentarily drop zonehash_lock while we optimistically
	 * bind curproc to the pool it should be running in.  This is safe
	 * since the zone can't disappear (we have a hold on it).
	 */
	zone_hold(zone);
	mutex_exit(&zonehash_lock);

	/*
	 * Grab pool_lock to keep the pools configuration from changing
	 * and to stop ourselves from getting rebound to another pool
	 * until we join the zone.
	 */
	if (pool_lock_intr() != 0) {
		zone_rele(zone);
		err = EINTR;
		goto out;
	}
	ASSERT(secpolicy_pool(CRED()) == 0);
	/*
	 * Bind ourselves to the pool currently associated with the zone.
	 */
	oldpool = curproc->p_pool;
	newpool = zone_pool_get(zone);
	if (pool_state == POOL_ENABLED && newpool != oldpool &&
	    (err = pool_do_bind(newpool, P_PID, P_MYID,
	    POOL_BIND_ALL)) != 0) {
		pool_unlock();
		zone_rele(zone);
		goto out;
	}

	/*
	 * Grab cpu_lock now; we'll need it later when we call
	 * task_join().
	 */
	mutex_enter(&cpu_lock);
	mutex_enter(&zonehash_lock);
	/*
	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
	 */
	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
		/*
		 * Can't join anymore.
		 */
		mutex_exit(&zonehash_lock);
		mutex_exit(&cpu_lock);
		if (pool_state == POOL_ENABLED &&
		    newpool != oldpool)
			(void) pool_do_bind(oldpool, P_PID, P_MYID,
			    POOL_BIND_ALL);
		pool_unlock();
		zone_rele(zone);
		err = EINVAL;
		goto out;
	}

	mutex_enter(&pp->p_lock);
	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
	/* verify that we do not exceed any task or lwp limits */
	mutex_enter(&zone->zone_nlwps_lock);
	/* add new lwps to zone and zone's proj0 */
	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
	zone->zone_nlwps += pp->p_lwpcnt;
	/* add 1 task to zone's proj0 */
	zone_proj0->kpj_ntasks += 1;
	mutex_exit(&pp->p_lock);
	mutex_exit(&zone->zone_nlwps_lock);

	/* remove lwps from proc's old zone and old project */
	mutex_enter(&pp->p_zone->zone_nlwps_lock);
	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
	mutex_exit(&pp->p_zone->zone_nlwps_lock);

	/*
	 * Joining the zone cannot fail from now on.
	 *
	 * This means that a lot of the following code can be commonized and
	 * shared with zsched().
	 */

	/*
	 * Reset the encapsulating process contract's zone.
	 */
	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
	contract_setzuniqid(ct, zone->zone_uniqid);

	/*
	 * Create a new task and associate the process with the project keyed
	 * by (projid,zoneid).
	 *
	 * We might as well be in project 0; the global zone's projid doesn't
	 * make much sense in a zone anyhow.
	 *
	 * This also increments zone_ntasks, and returns with p_lock held.
	 */
	tk = task_create(0, zone);
	oldtk = task_join(tk, 0);
	mutex_exit(&cpu_lock);

	pp->p_flag |= SZONETOP;
	pp->p_zone = zone;

	/*
	 * call RCTLOP_SET functions on this proc
	 */
	e.rcep_p.zone = zone;
	e.rcep_t = RCENTITY_ZONE;
	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
	    RCD_CALLBACK);
	mutex_exit(&pp->p_lock);

	/*
	 * We don't need to hold any of zsched's locks here; not only do we
	 * know the process and zone aren't going away, we know its session
	 * isn't changing either.
	 *
	 * By joining zsched's session here, we mimic the behavior in the
	 * global zone of init's sid being the pid of sched.  We extend this
	 * to all zlogin-like zone_enter()'ing processes as well.
	 */
	mutex_enter(&pidlock);
	sp = zone->zone_zsched->p_sessp;
	SESS_HOLD(sp);
	mutex_enter(&pp->p_lock);
	pgexit(pp);
	SESS_RELE(pp->p_sessp);
	pp->p_sessp = sp;
	pgjoin(pp, zone->zone_zsched->p_pidp);
	mutex_exit(&pp->p_lock);
	mutex_exit(&pidlock);

	mutex_exit(&zonehash_lock);
	/*
	 * We're firmly in the zone; let pools progress.
	 */
	pool_unlock();
	task_rele(oldtk);
	/*
	 * We don't need to retain a hold on the zone since we already
	 * incremented zone_ntasks, so the zone isn't going anywhere.
	 */
	zone_rele(zone);

	/*
	 * Chroot
	 */
	vp = zone->zone_rootvp;
	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);

	/*
	 * Change process credentials
	 */
	newcr = cralloc();
	mutex_enter(&pp->p_crlock);
	cr = pp->p_cred;
	crcopy_to(cr, newcr);
	crsetzone(newcr, zone);
	pp->p_cred = newcr;

	/*
	 * Restrict all process privilege sets to zone limit
	 */
	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
	mutex_exit(&pp->p_crlock);
	crset(pp, newcr);

	/*
	 * Adjust upcount to reflect zone entry.
	 */
	uid = crgetruid(newcr);
	mutex_enter(&pidlock);
	upcount_dec(uid, GLOBAL_ZONEID);
	upcount_inc(uid, zoneid);
	mutex_exit(&pidlock);

	/*
	 * Set up core file path and content.
	 */
	set_core_defaults();

out:
	/*
	 * Let the other lwps continue.
	 */
	mutex_enter(&pp->p_lock);
	if (curthread != pp->p_agenttp)
		continuelwps(pp);
	mutex_exit(&pp->p_lock);

	return (err != 0 ? set_errno(err) : 0);
}

/*
 * Systemcall entry point for zone_list(2).
 *
 * Processes running in a (non-global) zone only see themselves.
 */
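
/*
 * An illustrative sketch, not part of this file: numzones is an in/out
 * parameter, so the usual pattern is one call for the count and a second
 * for the IDs, assuming the libc wrapper declared in <zone.h>:
 *
 *	#include <stdlib.h>
 *	#include <zone.h>
 *
 *	uint_t nzones = 0;
 *	zoneid_t *ids;
 *
 *	if (zone_list(NULL, &nzones) == 0 && nzones != 0) {
 *		ids = malloc(nzones * sizeof (zoneid_t));
 *		if (ids != NULL && zone_list(ids, &nzones) == 0) {
 *			... at most nzones IDs are now in ids ...
 *		}
 *		free(ids);
 *	}
 *
 * Since zones can come and go between the two calls, the kernel always
 * writes back the current count, which may differ from the number of IDs
 * actually copied out.
 */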
static int
zone_list(zoneid_t *zoneidlist, uint_t *numzones)
{
	zoneid_t *zoneids;
	zone_t *zone;
	uint_t user_nzones, real_nzones;
	int error = 0;
	uint_t i;

	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
		return (set_errno(EFAULT));

	if (curproc->p_zone != global_zone) {
		/* just return current zone */
		real_nzones = 1;
		zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
		zoneids[0] = curproc->p_zone->zone_id;
	} else {
		mutex_enter(&zonehash_lock);
		real_nzones = zonecount;
		if (real_nzones) {
			zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
			    KM_SLEEP);
			i = 0;
			for (zone = list_head(&zone_active); zone != NULL;
			    zone = list_next(&zone_active, zone))
				zoneids[i++] = zone->zone_id;
			ASSERT(i == real_nzones);
		}
		mutex_exit(&zonehash_lock);
	}

	if (user_nzones > real_nzones)
		user_nzones = real_nzones;

	if (copyout(&real_nzones, numzones, sizeof (uint_t)) != 0)
		error = EFAULT;
	else if (zoneidlist != NULL && user_nzones != 0) {
		if (copyout(zoneids, zoneidlist,
		    user_nzones * sizeof (zoneid_t)) != 0)
			error = EFAULT;
	}

	if (real_nzones)
		kmem_free(zoneids, real_nzones * sizeof (zoneid_t));

	if (error)
		return (set_errno(error));
	else
		return (0);
}

/*
 * Systemcall entry point for zone_lookup(2).
 *
 * Non-global zones are only able to see themselves.
 */
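
/*
 * An illustrative sketch, not part of this file, assuming the libc
 * wrapper declared in <zone.h>; the zone name used is hypothetical, and
 * a NULL name simply yields the caller's own zone ID:
 *
 *	#include <zone.h>
 *
 *	zoneid_t zid;
 *
 *	if ((zid = zone_lookup("myzone")) == (zoneid_t)-1) {
 *		... no zone of that name visible to the caller ...
 *	}
 */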
static zoneid_t
zone_lookup(const char *zone_name)
{
	char *kname;
	zone_t *zone;
	zoneid_t zoneid;
	int err;

	if (zone_name == NULL) {
		/* return caller's zone id */
		return (getzoneid());
	}

	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
		kmem_free(kname, ZONENAME_MAX);
		return (set_errno(err));
	}

	mutex_enter(&zonehash_lock);
	zone = zone_find_all_by_name(kname);
	kmem_free(kname, ZONENAME_MAX);
	if (zone == NULL || zone_status_get(zone) < ZONE_IS_READY ||
	    (curproc->p_zone != global_zone && curproc->p_zone != zone)) {
		/* in non-global zone, can only look up own name */
		mutex_exit(&zonehash_lock);
		return (set_errno(EINVAL));
	}
	zoneid = zone->zone_id;
	mutex_exit(&zonehash_lock);
	return (zoneid);
}

/* ARGSUSED */
long
zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4, void *arg5)
{
	zone_def zs;

	switch (cmd) {
	case ZONE_CREATE:
		if (get_udatamodel() == DATAMODEL_NATIVE) {
			if (copyin(arg1, &zs, sizeof (zone_def))) {
				return (set_errno(EFAULT));
			}
		} else {
#ifdef _SYSCALL32_IMPL
			zone_def32 zs32;

			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
				return (set_errno(EFAULT));
			}
			zs.zone_name =
			    (const char *)(unsigned long)zs32.zone_name;
			zs.zone_root =
			    (const char *)(unsigned long)zs32.zone_root;
			zs.zone_privs =
			    (const struct priv_set *)
			    (unsigned long)zs32.zone_privs;
			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
			zs.rctlbufsz = zs32.rctlbufsz;
			zs.extended_error =
			    (int *)(unsigned long)zs32.extended_error;
#else
			panic("get_udatamodel() returned bogus result\n");
#endif
		}

		return (zone_create(zs.zone_name, zs.zone_root,
		    zs.zone_privs, (caddr_t)zs.rctlbuf, zs.rctlbufsz,
		    zs.extended_error));
	case ZONE_BOOT:
		return (zone_boot((zoneid_t)(uintptr_t)arg1,
		    (const char *)arg2));
	case ZONE_DESTROY:
		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
	case ZONE_GETATTR:
		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
	case ZONE_ENTER:
		return (zone_enter((zoneid_t)(uintptr_t)arg1));
	case ZONE_LIST:
		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
	case ZONE_SHUTDOWN:
		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
	case ZONE_LOOKUP:
		return (zone_lookup((const char *)arg1));
	default:
		return (set_errno(EINVAL));
	}
}

struct zarg {
	zone_t *zone;
	zone_cmd_arg_t arg;
};

static int
zone_lookup_door(const char *zone_name, door_handle_t *doorp)
{
	char *buf;
	size_t buflen;
	int error;

	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
	buf = kmem_alloc(buflen, KM_SLEEP);
	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
	error = door_ki_open(buf, doorp);
	kmem_free(buf, buflen);
	return (error);
}

static void
zone_release_door(door_handle_t *doorp)
{
	door_ki_rele(*doorp);
	*doorp = NULL;
}

static void
zone_ki_call_zoneadmd(struct zarg *zargp)
{
	door_handle_t door = NULL;
	door_arg_t darg, save_arg;
	char *zone_name;
	size_t zone_namelen;
	zoneid_t zoneid;
	zone_t *zone;
	zone_cmd_arg_t arg;
	uint64_t uniqid;
	size_t size;
	int error;
	int retry;

	zone = zargp->zone;
	arg = zargp->arg;
	kmem_free(zargp, sizeof (*zargp));

	zone_namelen = strlen(zone->zone_name) + 1;
	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
	bcopy(zone->zone_name, zone_name, zone_namelen);
	zoneid = zone->zone_id;
	uniqid = zone->zone_uniqid;
	/*
	 * zoneadmd may be down, but at least we can empty out the zone.
	 * We can ignore the return value of zone_empty() since we're called
	 * from a kernel thread and know we won't be delivered any signals.
	 */
	ASSERT(curproc == &p0);
	(void) zone_empty(zone);
	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
	zone_rele(zone);

	/* the same buffer carries both the door request and the response */
	size = sizeof (arg);
	darg.rbuf = (char *)&arg;
	darg.data_ptr = (char *)&arg;
	darg.rsize = size;
	darg.data_size = size;
	darg.desc_ptr = NULL;
	darg.desc_num = 0;

	save_arg = darg;
	/*
	 * Since we're not holding a reference to the zone, any number of
	 * things can go wrong, including the zone disappearing before we
	 * get a chance to talk to zoneadmd.
	 */
	for (retry = 0; /* forever */; retry++) {
		if (door == NULL &&
		    (error = zone_lookup_door(zone_name, &door)) != 0) {
			goto next;
		}
		ASSERT(door != NULL);

		if ((error = door_ki_upcall(door, &darg)) == 0) {
			break;
		}
		switch (error) {
		case EINTR:
			/* FALLTHROUGH */
		case EAGAIN:	/* process may be forking */
			/*
			 * Back off for a bit
			 */
			break;
		case EBADF:
			zone_release_door(&door);
			if (zone_lookup_door(zone_name, &door) != 0) {
				/*
				 * zoneadmd may be dead, but it may come
				 * back to life later.
				 */
				break;
			}
			break;
		default:
			cmn_err(CE_WARN,
			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
			    error);
			goto out;
		}
next:
		/*
		 * If this isn't the same zone_t that we originally had in
		 * mind, then this is the same as if two kadmin requests come
		 * in at the same time: the first one wins.  This means we
		 * lose, so we bail.
		 */
		if ((zone = zone_find_by_id(zoneid)) == NULL) {
			/*
			 * Problem is solved.
			 */
			break;
		}
		if (zone->zone_uniqid != uniqid) {
			/*
			 * zoneid recycled
			 */
			zone_rele(zone);
			break;
		}
		/*
		 * We could zone_status_timedwait(), but there doesn't seem to
		 * be much point in doing that (plus, it would mean that
		 * zone_free() isn't called until this thread exits).
		 */
		zone_rele(zone);
		delay(hz);
		darg = save_arg;
	}
out:
	if (door != NULL) {
		zone_release_door(&door);
	}
	kmem_free(zone_name, zone_namelen);
	thread_exit();
}

/*
 * Entry point for uadmin() to tell the zone to go away or reboot.  The
 * caller is a process in the zone to be modified.
 *
 * In order to shut down the zone, we will hand off control to zoneadmd
 * (running in the global zone) via a door.  We do a half-hearted job at
 * killing all processes in the zone, create a kernel thread to contact
 * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
 * a form of generation number used to let zoneadmd (as well as
 * zone_destroy()) know exactly which zone they're talking about.
 */
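
/*
 * An illustrative sketch, not part of this file: from inside a zone, a
 * reboot request such as
 *
 *	#include <sys/uadmin.h>
 *
 *	(void) uadmin(A_SHUTDOWN, AD_BOOT, 0);
 *
 * arrives here and is translated to Z_REBOOT by the switch below before
 * being handed off to zoneadmd.
 */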
int
zone_uadmin(int cmd, int fcn, cred_t *credp)
{
	struct zarg *zargp;
	zone_cmd_t zcmd;
	zone_t *zone;

	zone = curproc->p_zone;
	ASSERT(getzoneid() != GLOBAL_ZONEID);

	switch (cmd) {
	case A_SHUTDOWN:
		switch (fcn) {
		case AD_HALT:
		case AD_POWEROFF:
			zcmd = Z_HALT;
			break;
		case AD_BOOT:
			zcmd = Z_REBOOT;
			break;
		case AD_IBOOT:
		case AD_SBOOT:
		case AD_SIBOOT:
		case AD_NOSYNC:
			return (ENOTSUP);
		default:
			return (EINVAL);
		}
		break;
	case A_REBOOT:
		zcmd = Z_REBOOT;
		break;
	case A_FTRACE:
	case A_REMOUNT:
	case A_FREEZE:
	case A_DUMP:
		return (ENOTSUP);
	default:
		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
		return (EINVAL);
	}

	if (secpolicy_zone_admin(credp, B_FALSE))
		return (EPERM);
	mutex_enter(&zone_status_lock);
	/*
	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
	 * is in the zone.
	 */
	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
		/*
		 * This zone is already on its way down.
		 */
		mutex_exit(&zone_status_lock);
		return (0);
	}
	/*
	 * Prevent future zone_enter()s
	 */
	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
	mutex_exit(&zone_status_lock);

	/*
	 * Kill everyone now and call zoneadmd later.
	 * zone_ki_call_zoneadmd() will do a more thorough job of this
	 * later.
	 */
	killall(zone->zone_id);
	/*
	 * Now, create the thread to contact zoneadmd and do the rest of the
	 * work.  This thread can't be created in our zone otherwise
	 * zone_destroy() would deadlock.
	 */
	zargp = kmem_alloc(sizeof (*zargp), KM_SLEEP);
	zargp->arg.cmd = zcmd;
	zargp->arg.uniqid = zone->zone_uniqid;
	(void) strcpy(zargp->arg.locale, "C");
	zone_hold(zargp->zone = zone);

	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
	    TS_RUN, minclsyspri);
	exit(CLD_EXITED, 0);

	/* NOTREACHED */
	return (EINVAL);
}

/*
 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
 * status to ZONE_IS_SHUTTING_DOWN.
 */
void
zone_shutdown_global(void)
{
	ASSERT(curproc->p_zone == global_zone);

	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
	mutex_exit(&zone_status_lock);
}