usr/src/uts/common/fs/zfs/vdev.c
       
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>

/*
 * Virtual device management.
 */

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	NULL
};

/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
	uint64_t csize;
	uint64_t c;

	for (c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}
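
/*
 * Worked example (illustrative, not part of the original file): with
 * ashift = 9 (512-byte sectors), a 4000-byte psize rounds up to the
 * next sector boundary:
 *
 *	P2ROUNDUP(4000, 1ULL << 9) == 4096
 *
 * For an interior vdev such as a two-way mirror, the loop above then
 * takes the MAX of that value and each child's asize, so the result
 * is governed by the child with the largest allocation overhead.
 */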
       
vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	if (vdev < rvd->vdev_children)
		return (rvd->vdev_child[vdev]);

	return (NULL);
}

vdev_t *
vdev_lookup_by_path(vdev_t *vd, const char *path)
{
	int c;
	vdev_t *mvd;

	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
		return (vd);

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	int c;
	vdev_t *mvd;

	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
		return (vd);

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
}

void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}
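
/*
 * Sketch of the guid sum invariant maintained above (illustrative
 * only): a vdev's vdev_guid_sum is its own guid plus the guid sums
 * of all its children.  For a mirror M with leaves A and B:
 *
 *	M.vdev_guid_sum == M.vdev_guid + A.vdev_guid + B.vdev_guid
 *
 * Adding or removing a child therefore adjusts every ancestor by the
 * child's entire subtree sum, which is what the walk-up loops in
 * vdev_add_child() and vdev_remove_child() do.  The root's guid sum
 * then gives a cheap whole-tree check for missing or swapped devices.
 */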
       
/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc, c;

	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));

	for (c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
static vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	while (guid == 0)
		guid = spa_get_random(-1ULL);

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;

	mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
	list_create(&vd->vdev_io_pending, sizeof (zio_t),
	    offsetof(zio_t, io_pending));
	mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();

	return (vd);
}

/*
 * Free a vdev_t that has been removed from service.
 */
static void
vdev_free_common(vdev_t *vd)
{
	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_map);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_scrub);
	mutex_exit(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dirty_lock);
	list_destroy(&vd->vdev_io_pending);
	mutex_destroy(&vd->vdev_io_lock);
	cv_destroy(&vd->vdev_io_cv);

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
vdev_t *
vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (NULL);

	if ((ops = vdev_getops(type)) == NULL)
		return (NULL);

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (NULL);

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (NULL);
	}

	vd = vdev_alloc_common(spa, id, guid, ops);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
		    &vd->vdev_ashift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object.
	 */
	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
		    &vd->vdev_dtl.smo_object);
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	return (vd);
}

void
vdev_free(vdev_t *vd)
{
	int c;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	/*
	 * It's possible to free a vdev that's been added to the dirty
	 * list when in the middle of spa_vdev_add().  Handle that case
	 * correctly here.
	 */
	if (vd->vdev_is_dirty)
		vdev_config_clean(vd);

	/*
	 * Free all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd == vd->vdev_top)
		vdev_metaslab_fini(vd);

	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	vdev_free_common(vd);
}

/*
 * Transfer top-level vdev state from svd to tvd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_mg->mg_vd = tvd;
	tvd->vdev_ms = svd->vdev_ms;
	tvd->vdev_smo = svd->vdev_smo;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;
	svd->vdev_smo = NULL;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
		svd->vdev_dirty[t] = 0;
	}

	if (svd->vdev_is_dirty) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	ASSERT(svd->vdev_io_retry == NULL);
	ASSERT(list_is_empty(&svd->vdev_io_pending));
}

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	int c;

	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops);

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}
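
/*
 * Sketch of the two tree transformations above (illustrative only).
 * vdev_add_parent() interposes a new interior vdev; for example,
 * attaching a replacement disk B to leaf A yields:
 *
 *	root                    root
 *	  |           ==>         |
 *	  A                   replacing
 *	                        /    \
 *	                       A      B
 *
 * vdev_remove_parent() is the inverse, applied once the interior
 * vdev is down to one child: the survivor inherits mvd's vdev_id,
 * and if mvd was a top-level vdev, vdev_top_transfer() moves the
 * metaslab and dirty state back onto the child.
 */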
       
void
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	metaslab_class_t *mc = spa_metaslab_class_select(spa);
	uint64_t c;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	space_map_obj_t *smo = vd->vdev_smo;
	metaslab_t **mspp = vd->vdev_ms;

	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);

	ASSERT(oldc <= newc);

	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
	vd->vdev_ms_count = newc;

	if (vd->vdev_mg == NULL) {
		if (txg == 0) {
			dmu_buf_t *db;
			uint64_t *ms_array;

			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
			    KM_SLEEP);

			dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
			    0, newc * sizeof (uint64_t), ms_array);

			for (c = 0; c < newc; c++) {
				if (ms_array[c] == 0)
					continue;
				db = dmu_bonus_hold(spa->spa_meta_objset,
				    ms_array[c]);
				dmu_buf_read(db);
				ASSERT3U(db->db_size, ==, sizeof (*smo));
				bcopy(db->db_data, &vd->vdev_smo[c],
				    db->db_size);
				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
				    ms_array[c]);
				dmu_buf_rele(db);
			}
			kmem_free(ms_array, newc * sizeof (uint64_t));
		}
		vd->vdev_mg = metaslab_group_create(mc, vd);
	}

	for (c = 0; c < oldc; c++) {
		vd->vdev_smo[c] = smo[c];
		vd->vdev_ms[c] = mspp[c];
		mspp[c]->ms_smo = &vd->vdev_smo[c];
	}

	for (c = oldc; c < newc; c++)
		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);

	if (oldc != 0) {
		kmem_free(smo, oldc * sizeof (*smo));
		kmem_free(mspp, oldc * sizeof (*mspp));
	}
}

void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		for (m = 0; m < count; m++)
			metaslab_fini(vd->vdev_ms[m]);
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}

	if (vd->vdev_smo != NULL) {
		kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
		vd->vdev_smo = NULL;
	}
}

/*
 * Prepare a virtual device for access.
 */
int
vdev_open(vdev_t *vd)
{
	int error;
	vdev_knob_t *vk;
	int c;
	uint64_t osize = 0;
	uint64_t asize, psize;
	uint64_t ashift = -1ULL;

	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
		vd->vdev_fault_arg >>= 1;
	else
		vd->vdev_fault_mode = VDEV_FAULT_NONE;

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;

	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
		uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);

		*valp = vk->vk_default;
		*valp = MAX(*valp, vk->vk_min);
		*valp = MIN(*valp, vk->vk_max);
	}

	if (vd->vdev_ops->vdev_op_leaf) {
		vdev_cache_init(vd);
		vdev_queue_init(vd);
		vd->vdev_cache_active = B_TRUE;
	}

	if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
		vd->vdev_state = VDEV_STATE_OFFLINE;
		return (ENXIO);
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);

	dprintf("%s = %d, osize %llu, state = %d\n",
	    vdev_description(vd), error, osize, vd->vdev_state);

	if (error) {
		dprintf("%s in %s failed to open, error %d, aux %d\n",
		    vdev_description(vd),
		    vdev_description(vd->vdev_parent),
		    error,
		    vd->vdev_stat.vs_aux);

		vd->vdev_state = VDEV_STATE_CANT_OPEN;
		return (error);
	}

	vd->vdev_state = VDEV_STATE_HEALTHY;

	for (c = 0; c < vd->vdev_children; c++)
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
			vd->vdev_state = VDEV_STATE_DEGRADED;

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
			return (EOVERFLOW);
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
	} else {
		if (osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
			return (EOVERFLOW);
		}
		psize = 0;
		asize = osize;
	}

	vd->vdev_psize = psize;

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 */
		vd->vdev_asize = asize;
		vd->vdev_ashift = ashift;
	} else {
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (ashift > vd->vdev_ashift) {
			dprintf("%s: ashift grew\n", vdev_description(vd));
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (EINVAL);
		}

		/*
		 * Make sure the device hasn't shrunk.
		 */
		if (asize < vd->vdev_asize) {
			dprintf("%s: device shrank\n", vdev_description(vd));
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (EINVAL);
		}

		/*
		 * If all children are healthy and the asize has increased,
		 * then we've experienced dynamic LUN growth.
		 */
		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
		    asize > vd->vdev_asize) {
			dprintf("%s: device grew\n", vdev_description(vd));
			vd->vdev_asize = asize;
		}
	}

	return (0);
}
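
/*
 * Illustrative sizing note (assuming the usual label layout, with
 * labels and boot block at the front of the device and more labels
 * at the back): for a leaf vdev the usable allocation size is
 *
 *	asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
 *
 * Interior vdevs report an osize from which their children have
 * already deducted this overhead, which is why the else-branch in
 * vdev_open() uses asize = osize directly.
 */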
       
/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
	ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);

	vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_cache_active) {
		vdev_cache_fini(vd);
		vdev_queue_fini(vd);
		vd->vdev_cache_active = B_FALSE;
	}

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
}

void
vdev_reopen(vdev_t *vd, zio_t **rq)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int c;

	if (vd == rvd) {
		ASSERT(rq == NULL);
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_reopen(rvd->vdev_child[c], NULL);
		return;
	}

	/* only valid for top-level vdevs */
	ASSERT3P(vd, ==, vd->vdev_top);

	/*
	 * vdev_state can change when spa_config_lock is held as writer,
	 * or when it's held as reader and we're doing a vdev_reopen().
	 * To handle the latter case, we grab rvd's io_lock to serialize
	 * reopens.  This ensures that there's never more than one vdev
	 * state changer active at a time.
	 */
	mutex_enter(&rvd->vdev_io_lock);

	mutex_enter(&vd->vdev_io_lock);
	while (list_head(&vd->vdev_io_pending) != NULL)
		cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
	vdev_close(vd);
	(void) vdev_open(vd);
	if (rq != NULL) {
		*rq = vd->vdev_io_retry;
		vd->vdev_io_retry = NULL;
	}
	mutex_exit(&vd->vdev_io_lock);

	/*
	 * Reassess root vdev's health.
	 */
	rvd->vdev_state = VDEV_STATE_HEALTHY;
	for (c = 0; c < rvd->vdev_children; c++) {
		uint64_t state = rvd->vdev_child[c]->vdev_state;
		rvd->vdev_state = MIN(rvd->vdev_state, state);
	}

	mutex_exit(&rvd->vdev_io_lock);
}

int
vdev_create(vdev_t *vd, uint64_t txg)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}
       
/*
 * This is the latter half of vdev_create().  It is distinct because it
 * involves initiating transactions in order to do metaslab creation.
 * For creation, we want to try to create all vdevs at once and then undo it
 * if anything fails; this is much harder if we have pending transactions.
 */
void
vdev_init(vdev_t *vd, uint64_t txg)
{
	/*
	 * Aim for roughly 200 metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);

	/*
	 * Initialize the vdev's metaslabs.
	 */
	vdev_metaslab_init(vd, txg);
}
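
/*
 * Worked example of the metaslab sizing above (illustrative only):
 * for vdev_asize = 128 GB (2^37), asize / 200 is roughly 687 MB, and
 * highbit() rounds that up to the next power of two, so
 * vdev_ms_shift = 30.  That yields 1 GB metaslabs and
 * 2^37 >> 30 = 128 of them -- between 100 and 200, as intended.
 * The MAX() with SPA_MAXBLOCKSHIFT keeps each metaslab at least one
 * maximum-sized block long on very small vdevs.
 */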
       
void
vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
{
	vdev_t *tvd = vd->vdev_top;

	mutex_enter(&tvd->vdev_dirty_lock);
	if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
		tvd->vdev_dirty[txg & TXG_MASK] |= flags;
		(void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
		    tvd, txg);
	}
	mutex_exit(&tvd->vdev_dirty_lock);
}
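
/*
 * Illustrative note: vdev_dirty[] is a small ring indexed by
 * txg & TXG_MASK, so only the last TXG_SIZE transaction groups are
 * tracked at once.  With TXG_SIZE = 4, txg 14 and txg 18 both map to
 * slot 2; this is safe because vdev_sync() consumes and clears a
 * txg's dirty bits before the slot can be reused.
 */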
       
void
vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
{
	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
}

int
vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
{
	int dirty;

	/*
	 * Quick test without the lock -- covers the common case that
	 * there are no dirty time segments.
	 */
	if (sm->sm_space == 0)
		return (0);

	mutex_enter(sm->sm_lock);
	dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);

	return (dirty);
}
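
/*
 * Sketch of DTL (dirty time log) semantics (illustrative only): the
 * space map is keyed by txg, not by device offset.  If a mirror
 * child was unreachable while txgs 100-149 committed, its DTL holds
 * the segment [100, 150), so
 *
 *	vdev_dtl_contains(&vd->vdev_dtl_map, 120, 1)
 *
 * returns nonzero: reads of data born in that window must avoid the
 * stale child, and a resilver knows exactly which txgs to repair.
 */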
       
/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	int c;

	ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		/*
		 * We've successfully scrubbed everything up to scrub_txg.
		 * Therefore, excise all old DTLs up to that point, then
		 * fold in the DTLs for everything we couldn't scrub.
		 */
		if (scrub_txg != 0) {
			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
		}
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
		mutex_exit(&vd->vdev_dtl_lock);
		if (txg != 0) {
			vdev_t *tvd = vd->vdev_top;
			vdev_dirty(tvd, VDD_DTL, txg);
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
		}
		return;
	}

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	mutex_exit(&vd->vdev_dtl_lock);

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
		mutex_exit(&vd->vdev_dtl_lock);
	}
}
       
static int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)
		return (0);

	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
	dmu_buf_read(db);
	ASSERT3U(db->db_size, ==, sizeof (*smo));
	bcopy(db->db_data, smo, db->db_size);
	dmu_buf_rele(db);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
	    spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
}

void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	space_map_t *sm = &vd->vdev_dtl_map;
	space_map_t smsync;
	kmutex_t smlock;
	avl_tree_t *t = &sm->sm_root;
	space_seg_t *ss;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s in txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(spa->spa_meta_objset,
			    smo->smo_object, tx);
			ASSERT3U(err, ==, 0);
			smo->smo_object = 0;
		}
		dmu_tx_commit(tx);
		return;
	}

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);
	}

	dmu_free_range(spa->spa_meta_objset, smo->smo_object,
	    0, smo->smo_objsize, tx);

	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
		space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
	mutex_exit(&vd->vdev_dtl_lock);

	smo->smo_objsize = 0;
	smo->smo_alloc = smsync.sm_space;

	space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, ==, sizeof (*smo));
	bcopy(smo, db->db_data, db->db_size);
	dmu_buf_rele(db);

	dmu_tx_commit(tx);
}
       
int
vdev_load(vdev_t *vd, int import)
{
	spa_t *spa = vd->vdev_spa;
	int c, error;
	nvlist_t *label;
	uint64_t guid, state;

	dprintf("loading %s\n", vdev_description(vd));

	/*
	 * Recursively load all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
			return (error);

	/*
	 * If this is a leaf vdev, make sure it agrees with its disk labels.
	 */
	if (vd->vdev_ops->vdev_op_leaf) {

		if (vdev_is_dead(vd))
			return (0);

		/*
		 * XXX state transitions don't propagate to parent here.
		 * Also, merely setting the state isn't sufficient because
		 * it's not persistent; a vdev_reopen() would make us
		 * forget all about it.
		 */
		if ((label = vdev_label_read_config(vd)) == NULL) {
			dprintf("can't load label config\n");
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
		    &guid) != 0 || guid != spa_guid(spa)) {
			dprintf("bad or missing pool GUID (%llu)\n", guid);
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
		    guid != vd->vdev_guid) {
			dprintf("bad or missing vdev guid (%llu != %llu)\n",
			    guid, vd->vdev_guid);
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		/*
		 * If we find a vdev with a matching pool guid and vdev guid,
		 * but the pool state is not active, it indicates that the user
		 * exported or destroyed the pool without affecting the config
		 * cache (if / was mounted readonly, for example).  In this
		 * case, immediately return EBADF so the caller can remove it
		 * from the config.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state)) {
			dprintf("missing pool state\n");
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (state != POOL_STATE_ACTIVE &&
		    (!import || state != POOL_STATE_EXPORTED)) {
			dprintf("pool state not active (%llu)\n", state);
			nvlist_free(label);
			return (EBADF);
		}

		nvlist_free(label);
	}

	/*
	 * If this is a top-level vdev, make sure its allocation parameters
	 * exist and initialize its metaslabs.
	 */
	if (vd == vd->vdev_top) {

		if (vd->vdev_ms_array == 0 ||
		    vd->vdev_ms_shift == 0 ||
		    vd->vdev_ashift == 0 ||
		    vd->vdev_asize == 0) {
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (0);
		}

		vdev_metaslab_init(vd, 0);
	}

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf) {
		error = vdev_dtl_load(vd);
		if (error) {
			dprintf("can't load DTL for %s, error %d\n",
			    vdev_description(vd), error);
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (0);
		}
	}

	return (0);
}
       
void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;

	dprintf("%s txg %llu\n", vdev_description(vd), txg);

	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
		metaslab_sync_done(msp, txg);
}

void
vdev_add_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	ASSERT(vd == vd->vdev_top);

	if (vd->vdev_ms_array == 0)
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);

	ASSERT(vd->vdev_ms_array != 0);

	vdev_config_dirty(vd);

	dmu_tx_commit(tx);
}

void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
	uint8_t dirty = *dirtyp;

	mutex_enter(&vd->vdev_dirty_lock);
	*dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
	mutex_exit(&vd->vdev_dirty_lock);

	dprintf("%s txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	if (dirty & VDD_ADD)
		vdev_add_sync(vd, txg);

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
		metaslab_sync(msp, txg);

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}

uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
}

void
vdev_io_start(zio_t *zio)
{
	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
}

void
vdev_io_done(zio_t *zio)
{
	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
}

const char *
vdev_description(vdev_t *vd)
{
	if (vd == NULL || vd->vdev_ops == NULL)
		return ("<unknown>");

	if (vd->vdev_path != NULL)
		return (vd->vdev_path);

	if (vd->vdev_parent == NULL)
		return (spa_name(vd->vdev_spa));

	return (vd->vdev_ops->vdev_op_type);
}

int
vdev_online(spa_t *spa, const char *path)
{
	vdev_t *vd;

	spa_config_enter(spa, RW_WRITER);

	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
		spa_config_exit(spa);
		return (ENODEV);
	}

	dprintf("ONLINE: %s\n", vdev_description(vd));

	vd->vdev_offline = B_FALSE;

	/*
	 * Clear the error counts.  The idea is that you expect to see all
	 * zeroes when everything is working, so if you've just onlined a
	 * device, you don't want to keep hearing about errors from before.
	 */
	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	vdev_reopen(vd->vdev_top, NULL);

	spa_config_exit(spa);

	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
vdev_offline(spa_t *spa, const char *path)
{
	vdev_t *vd;

	spa_config_enter(spa, RW_WRITER);

	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
		spa_config_exit(spa);
		return (ENODEV);
	}

	dprintf("OFFLINE: %s\n", vdev_description(vd));

	/*
	 * If this device's top-level vdev has a non-empty DTL,
	 * don't allow the device to be offlined.
	 *
	 * XXX -- we should make this more precise by allowing the offline
	 * as long as the remaining devices don't have any DTL holes.
	 */
	if (vd->vdev_top->vdev_dtl_map.sm_space != 0) {
		spa_config_exit(spa);
		return (EBUSY);
	}

	/*
	 * Set this device to offline state and reopen its top-level vdev.
	 * If this action results in the top-level vdev becoming unusable,
	 * undo it and fail the request.
	 */
	vd->vdev_offline = B_TRUE;
	vdev_reopen(vd->vdev_top, NULL);
	if (vdev_is_dead(vd->vdev_top)) {
		vd->vdev_offline = B_FALSE;
		vdev_reopen(vd->vdev_top, NULL);
		spa_config_exit(spa);
		return (EBUSY);
	}

	spa_config_exit(spa);

	return (0);
}

int
vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
{
	vdev_t *vd;

	spa_config_enter(spa, RW_WRITER);

	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
		spa_config_exit(spa);
		return (ENODEV);
	}

	vd->vdev_fault_mode = mode;
	vd->vdev_fault_mask = mask;
	vd->vdev_fault_arg = arg;

	spa_config_exit(spa);

	return (0);
}
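
/*
 * Illustrative use of the fault-injection hooks above (the device
 * path here is hypothetical):
 *
 *	(void) vdev_error_setup(spa, "/dev/dsk/c0t0d0s0",
 *	    VDEV_FAULT_RANDOM, 1 << ZIO_TYPE_READ, 100);
 *
 * fails roughly 1 in 100 reads on that leaf, since vdev_error_inject()
 * checks the 1ULL << zio->io_type bit against vdev_fault_mask before
 * applying the mode.  Note that vdev_open() halves a VDEV_FAULT_COUNT
 * argument on each reopen and clears any other fault mode entirely.
 */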
       
  1384 
       
  1385 int
       
  1386 vdev_is_dead(vdev_t *vd)
       
  1387 {
       
  1388 	return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
       
  1389 }
       
  1390 
       
  1391 int
       
  1392 vdev_error_inject(vdev_t *vd, zio_t *zio)
       
  1393 {
       
  1394 	int error = 0;
       
  1395 
       
  1396 	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
       
  1397 		return (0);
       
  1398 
       
  1399 	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
       
  1400 		return (0);
       
  1401 
       
  1402 	switch (vd->vdev_fault_mode) {
       
  1403 	case VDEV_FAULT_RANDOM:
       
  1404 		if (spa_get_random(vd->vdev_fault_arg) == 0)
       
  1405 			error = EIO;
       
  1406 		break;
       
  1407 
       
  1408 	case VDEV_FAULT_COUNT:
       
  1409 		if ((int64_t)--vd->vdev_fault_arg <= 0)
       
  1410 			vd->vdev_fault_mode = VDEV_FAULT_NONE;
       
  1411 		error = EIO;
       
  1412 		break;
       
  1413 	}
       
  1414 
       
  1415 	if (error != 0) {
       
  1416 		dprintf("returning %d for type %d on %s state %d offset %llx\n",
       
  1417 		    error, zio->io_type, vdev_description(vd),
       
  1418 		    vd->vdev_state, zio->io_offset);
       
  1419 	}
       
  1420 
       
  1421 	return (error);
       
  1422 }
       
  1423 
       
  1424 /*
       
  1425  * Get statistics for the given vdev.
       
  1426  */
       
  1427 void
       
  1428 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
       
  1429 {
       
  1430 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
       
  1431 	int c, t;
       
  1432 
       
  1433 	mutex_enter(&vd->vdev_stat_lock);
       
  1434 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
       
  1435 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
       
  1436 	vs->vs_state = vd->vdev_state;
       
  1437 	mutex_exit(&vd->vdev_stat_lock);
       
  1438 
       
  1439 	/*
       
  1440 	 * If we're getting stats on the root vdev, aggregate the I/O counts
       
  1441 	 * over all top-level vdevs (i.e. the direct children of the root).
       
  1442 	 */
       
  1443 	if (vd == rvd) {
       
  1444 		for (c = 0; c < rvd->vdev_children; c++) {
       
  1445 			vdev_t *cvd = rvd->vdev_child[c];
       
  1446 			vdev_stat_t *cvs = &cvd->vdev_stat;
       
  1447 
       
  1448 			mutex_enter(&vd->vdev_stat_lock);
       
  1449 			for (t = 0; t < ZIO_TYPES; t++) {
       
  1450 				vs->vs_ops[t] += cvs->vs_ops[t];
       
  1451 				vs->vs_bytes[t] += cvs->vs_bytes[t];
       
  1452 			}
       
  1453 			vs->vs_read_errors += cvs->vs_read_errors;
       
  1454 			vs->vs_write_errors += cvs->vs_write_errors;
       
  1455 			vs->vs_checksum_errors += cvs->vs_checksum_errors;
       
  1456 			vs->vs_scrub_examined += cvs->vs_scrub_examined;
       
  1457 			vs->vs_scrub_errors += cvs->vs_scrub_errors;
       
  1458 			mutex_exit(&vd->vdev_stat_lock);
       
  1459 		}
       
  1460 	}
       
  1461 }
       
  1462 
       
void
vdev_stat_update(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	if (zio->io_error == 0) {
		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
			mutex_enter(&vd->vdev_stat_lock);
			vs->vs_ops[type]++;
			vs->vs_bytes[type] += zio->io_size;
			mutex_exit(&vd->vdev_stat_lock);
		}
		if ((flags & ZIO_FLAG_IO_REPAIR) &&
		    zio->io_delegate_list == NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
				vs->vs_scrub_repaired += zio->io_size;
			else
				vs->vs_self_healed += zio->io_size;
			mutex_exit(&vd->vdev_stat_lock);
		}
		return;
	}

	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	if (!vdev_is_dead(vd)) {
		mutex_enter(&vd->vdev_stat_lock);
		if (type == ZIO_TYPE_READ) {
			if (zio->io_error == ECKSUM)
				vs->vs_checksum_errors++;
			else
				vs->vs_read_errors++;
		}
		if (type == ZIO_TYPE_WRITE)
			vs->vs_write_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}

	if (type == ZIO_TYPE_WRITE) {
		if (txg == 0 || vd->vdev_children != 0)
			return;
		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
		}
		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
			vdev_t *tvd = vd->vdev_top;
			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
				return;
			vdev_dirty(tvd, VDD_DTL, txg);
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
		}
	}
}

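/*
 * Update scrub statistics for this vdev and, recursively, all of its
 * children.  POOL_SCRUB_NONE records the end of a scrub; any other type
 * records the start of a new one and resets the counters.
 */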
void
vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
{
	int c;
	vdev_stat_t *vs = &vd->vdev_stat;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);

	mutex_enter(&vd->vdev_stat_lock);

	if (type == POOL_SCRUB_NONE) {
		/*
		 * Update completion and end time.  Leave everything else alone
		 * so we can report what happened during the previous scrub.
		 */
		vs->vs_scrub_complete = complete;
		vs->vs_scrub_end = gethrestime_sec();
	} else {
		vs->vs_scrub_type = type;
		vs->vs_scrub_complete = 0;
		vs->vs_scrub_examined = 0;
		vs->vs_scrub_repaired = 0;
		vs->vs_scrub_errors = 0;
		vs->vs_scrub_start = gethrestime_sec();
		vs->vs_scrub_end = 0;
	}

	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Report checksum errors that a vdev didn't realize it made.
 * This can happen, for example, when RAID-Z combinatorial reconstruction
 * infers that one of its components returned bad data.
 */
void
vdev_checksum_error(zio_t *zio, vdev_t *vd)
{
	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
	    vdev_description(vd));

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
}

/*
 * Update the in-core space usage stats for this vdev and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
{
	ASSERT(vd == vd->vdev_top);

	do {
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_space += space_delta;
		vd->vdev_stat.vs_alloc += alloc_delta;
		mutex_exit(&vd->vdev_stat_lock);
	} while ((vd = vd->vdev_parent) != NULL);
}

/*
 * Various knobs to tune a vdev.
 */
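/*
 * Judging from the initializers below, each entry is
 * { name, description, min, max, default, offset }, where the offset
 * locates the tunable value within the vdev structure.
 */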
static vdev_knob_t vdev_knob[] = {
	{
		"cache_size",
		"size of the read-ahead cache",
		0,
		1ULL << 30,
		10ULL << 20,
		offsetof(struct vdev, vdev_cache.vc_size)
	},
	{
		"cache_bshift",
		"log2 of cache blocksize",
		SPA_MINBLOCKSHIFT,
		SPA_MAXBLOCKSHIFT,
		16,
		offsetof(struct vdev, vdev_cache.vc_bshift)
	},
	{
		"cache_max",
		"largest block size to cache",
		0,
		SPA_MAXBLOCKSIZE,
		1ULL << 14,
		offsetof(struct vdev, vdev_cache.vc_max)
	},
	{
		"min_pending",
		"minimum pending I/Os to the disk",
		1,
		10000,
		2,
		offsetof(struct vdev, vdev_queue.vq_min_pending)
	},
	{
		"max_pending",
		"maximum pending I/Os to the disk",
		1,
		10000,
		35,
		offsetof(struct vdev, vdev_queue.vq_max_pending)
	},
	{
		"agg_limit",
		"maximum size of aggregated I/Os",
		0,
		SPA_MAXBLOCKSIZE,
		SPA_MAXBLOCKSIZE,
		offsetof(struct vdev, vdev_queue.vq_agg_limit)
	},
	{
		"time_shift",
		"deadline = pri + (lbolt >> time_shift)",
		0,
		63,
		4,
		offsetof(struct vdev, vdev_queue.vq_time_shift)
	},
	{
		"ramp_rate",
		"exponential I/O issue ramp-up rate",
		1,
		10000,
		2,
		offsetof(struct vdev, vdev_queue.vq_ramp_rate)
	},
};

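/*
 * Iterate over the knob table: passing NULL returns the first knob,
 * and passing the last knob returns NULL.
 */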
vdev_knob_t *
vdev_knob_next(vdev_knob_t *vk)
{
	if (vk == NULL)
		return (vdev_knob);

	if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t))
		return (NULL);

	return (vk);
}

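/*
 * Sketch of a typical traversal (hypothetical caller):
 *
 *	vdev_knob_t *vk = NULL;
 *	while ((vk = vdev_knob_next(vk)) != NULL)
 *		... examine *vk ...
 */
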
/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		if (!vd->vdev_is_dirty) {
			list_insert_head(&spa->spa_dirty_list, vd);
			vd->vdev_is_dirty = B_TRUE;
		}
	}
}

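/*
 * Remove a top-level vdev from the dirty list once its configuration
 * has been synced.
 */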
void
vdev_config_clean(vdev_t *vd)
{
	ASSERT(vd->vdev_is_dirty);

	list_remove(&vd->vdev_spa->spa_dirty_list, vd);
	vd->vdev_is_dirty = B_FALSE;
}

/*
 * Set a vdev's state, updating any parent's state as well.
 */
void
vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
{
	if (state == vd->vdev_state)
		return;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	if (vd->vdev_parent != NULL) {
		int c;
		int degraded = 0, faulted = 0;
		vdev_t *parent, *child;

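		/*
		 * Count the parent's faulted and degraded children so its
		 * ops vector can recompute the parent's own state.
		 */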
		parent = vd->vdev_parent;
		for (c = 0; c < parent->vdev_children; c++) {
			child = parent->vdev_child[c];
			if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
				faulted++;
			else if (child->vdev_state == VDEV_STATE_DEGRADED)
				degraded++;
		}

		parent->vdev_ops->vdev_op_state_change(parent,
		    faulted, degraded);
	}
}