789
|
1 |
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
|
|
26 |
|
|
27 |
#pragma ident "%Z%%M% %I% %E% SMI"
|
|
28 |
|
|
29 |
#include <sys/zfs_context.h>
|
|
30 |
#include <sys/spa.h>
|
|
31 |
#include <sys/spa_impl.h>
|
|
32 |
#include <sys/dmu.h>
|
|
33 |
#include <sys/dmu_tx.h>
|
|
34 |
#include <sys/vdev_impl.h>
|
|
35 |
#include <sys/uberblock_impl.h>
|
|
36 |
#include <sys/metaslab.h>
|
|
37 |
#include <sys/metaslab_impl.h>
|
|
38 |
#include <sys/space_map.h>
|
|
39 |
#include <sys/zio.h>
|
|
40 |
#include <sys/zap.h>
|
|
41 |
#include <sys/fs/zfs.h>
|
|
42 |
|
|
43 |
/*
|
|
44 |
* Virtual device management.
|
|
45 |
*/
|
|
46 |
|
|
47 |
static vdev_ops_t *vdev_ops_table[] = {
|
|
48 |
&vdev_root_ops,
|
|
49 |
&vdev_raidz_ops,
|
|
50 |
&vdev_mirror_ops,
|
|
51 |
&vdev_replacing_ops,
|
|
52 |
&vdev_disk_ops,
|
|
53 |
&vdev_file_ops,
|
|
54 |
&vdev_missing_ops,
|
|
55 |
NULL
|
|
56 |
};
|
|
57 |
|
|
58 |
/*
|
|
59 |
* Given a vdev type, return the appropriate ops vector.
|
|
60 |
*/
|
|
61 |
static vdev_ops_t *
|
|
62 |
vdev_getops(const char *type)
|
|
63 |
{
|
|
64 |
vdev_ops_t *ops, **opspp;
|
|
65 |
|
|
66 |
for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
|
|
67 |
if (strcmp(ops->vdev_op_type, type) == 0)
|
|
68 |
break;
|
|
69 |
|
|
70 |
return (ops);
|
|
71 |
}
|
|
72 |
|
|
73 |
/*
|
|
74 |
* Default asize function: return the MAX of psize with the asize of
|
|
75 |
* all children. This is what's used by anything other than RAID-Z.
|
|
76 |
*/
|
|
77 |
uint64_t
|
|
78 |
vdev_default_asize(vdev_t *vd, uint64_t psize)
|
|
79 |
{
|
|
80 |
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
|
|
81 |
uint64_t csize;
|
|
82 |
uint64_t c;
|
|
83 |
|
|
84 |
for (c = 0; c < vd->vdev_children; c++) {
|
|
85 |
csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
|
|
86 |
asize = MAX(asize, csize);
|
|
87 |
}
|
|
88 |
|
|
89 |
return (asize);
|
|
90 |
}
|
|
91 |
|
|
92 |
vdev_t *
|
|
93 |
vdev_lookup_top(spa_t *spa, uint64_t vdev)
|
|
94 |
{
|
|
95 |
vdev_t *rvd = spa->spa_root_vdev;
|
|
96 |
|
|
97 |
if (vdev < rvd->vdev_children)
|
|
98 |
return (rvd->vdev_child[vdev]);
|
|
99 |
|
|
100 |
return (NULL);
|
|
101 |
}
|
|
102 |
|
|
103 |
/*
 * Depth-first search of the tree rooted at 'vd' for a vdev whose
 * vdev_path equals 'path'.  The node itself is tested before its
 * children, and children are visited in index order.  Returns the first
 * match, or NULL if there is none.
 */
vdev_t *
vdev_lookup_by_path(vdev_t *vd, const char *path)
{
	vdev_t *found = NULL;
	int i;

	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
		return (vd);

	/* Stop descending as soon as any subtree reports a hit. */
	for (i = 0; found == NULL && i < vd->vdev_children; i++)
		found = vdev_lookup_by_path(vd->vdev_child[i], path);

	return (found);
}
|
|
119 |
|
|
120 |
/*
 * Depth-first search of the tree rooted at 'vd' for a vdev with the
 * given guid.  Only vdevs without children are ever returned; a vdev
 * that has children is never matched itself, its children are searched
 * instead.  Returns NULL if no childless vdev carries 'guid'.
 */
vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	vdev_t *match;
	int i;

	if (vd->vdev_children == 0)
		return (vd->vdev_guid == guid ? vd : NULL);

	for (i = 0; i < vd->vdev_children; i++) {
		match = vdev_lookup_by_guid(vd->vdev_child[i], guid);
		if (match != NULL)
			return (match);
	}

	return (NULL);
}
|
|
136 |
|
|
137 |
void
|
|
138 |
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
|
|
139 |
{
|
|
140 |
size_t oldsize, newsize;
|
|
141 |
uint64_t id = cvd->vdev_id;
|
|
142 |
vdev_t **newchild;
|
|
143 |
|
|
144 |
ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
|
|
145 |
ASSERT(cvd->vdev_parent == NULL);
|
|
146 |
|
|
147 |
cvd->vdev_parent = pvd;
|
|
148 |
|
|
149 |
if (pvd == NULL)
|
|
150 |
return;
|
|
151 |
|
|
152 |
ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
|
|
153 |
|
|
154 |
oldsize = pvd->vdev_children * sizeof (vdev_t *);
|
|
155 |
pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
|
|
156 |
newsize = pvd->vdev_children * sizeof (vdev_t *);
|
|
157 |
|
|
158 |
newchild = kmem_zalloc(newsize, KM_SLEEP);
|
|
159 |
if (pvd->vdev_child != NULL) {
|
|
160 |
bcopy(pvd->vdev_child, newchild, oldsize);
|
|
161 |
kmem_free(pvd->vdev_child, oldsize);
|
|
162 |
}
|
|
163 |
|
|
164 |
pvd->vdev_child = newchild;
|
|
165 |
pvd->vdev_child[id] = cvd;
|
|
166 |
|
|
167 |
cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
|
|
168 |
ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
|
|
169 |
|
|
170 |
/*
|
|
171 |
* Walk up all ancestors to update guid sum.
|
|
172 |
*/
|
|
173 |
for (; pvd != NULL; pvd = pvd->vdev_parent)
|
|
174 |
pvd->vdev_guid_sum += cvd->vdev_guid_sum;
|
|
175 |
}
|
|
176 |
|
|
177 |
void
|
|
178 |
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
|
|
179 |
{
|
|
180 |
int c;
|
|
181 |
uint_t id = cvd->vdev_id;
|
|
182 |
|
|
183 |
ASSERT(cvd->vdev_parent == pvd);
|
|
184 |
|
|
185 |
if (pvd == NULL)
|
|
186 |
return;
|
|
187 |
|
|
188 |
ASSERT(id < pvd->vdev_children);
|
|
189 |
ASSERT(pvd->vdev_child[id] == cvd);
|
|
190 |
|
|
191 |
pvd->vdev_child[id] = NULL;
|
|
192 |
cvd->vdev_parent = NULL;
|
|
193 |
|
|
194 |
for (c = 0; c < pvd->vdev_children; c++)
|
|
195 |
if (pvd->vdev_child[c])
|
|
196 |
break;
|
|
197 |
|
|
198 |
if (c == pvd->vdev_children) {
|
|
199 |
kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
|
|
200 |
pvd->vdev_child = NULL;
|
|
201 |
pvd->vdev_children = 0;
|
|
202 |
}
|
|
203 |
|
|
204 |
/*
|
|
205 |
* Walk up all ancestors to update guid sum.
|
|
206 |
*/
|
|
207 |
for (; pvd != NULL; pvd = pvd->vdev_parent)
|
|
208 |
pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
|
|
209 |
}
|
|
210 |
|
|
211 |
/*
|
|
212 |
* Remove any holes in the child array.
|
|
213 |
*/
|
|
214 |
void
|
|
215 |
vdev_compact_children(vdev_t *pvd)
|
|
216 |
{
|
|
217 |
vdev_t **newchild, *cvd;
|
|
218 |
int oldc = pvd->vdev_children;
|
|
219 |
int newc, c;
|
|
220 |
|
|
221 |
ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
|
|
222 |
|
|
223 |
for (c = newc = 0; c < oldc; c++)
|
|
224 |
if (pvd->vdev_child[c])
|
|
225 |
newc++;
|
|
226 |
|
|
227 |
newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
|
|
228 |
|
|
229 |
for (c = newc = 0; c < oldc; c++) {
|
|
230 |
if ((cvd = pvd->vdev_child[c]) != NULL) {
|
|
231 |
newchild[newc] = cvd;
|
|
232 |
cvd->vdev_id = newc++;
|
|
233 |
}
|
|
234 |
}
|
|
235 |
|
|
236 |
kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
|
|
237 |
pvd->vdev_child = newchild;
|
|
238 |
pvd->vdev_children = newc;
|
|
239 |
}
|
|
240 |
|
|
241 |
/*
|
|
242 |
* Allocate and minimally initialize a vdev_t.
|
|
243 |
*/
|
|
244 |
static vdev_t *
|
|
245 |
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
|
246 |
{
|
|
247 |
vdev_t *vd;
|
|
248 |
|
|
249 |
while (guid == 0)
|
|
250 |
guid = spa_get_random(-1ULL);
|
|
251 |
|
|
252 |
vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
|
|
253 |
|
|
254 |
vd->vdev_spa = spa;
|
|
255 |
vd->vdev_id = id;
|
|
256 |
vd->vdev_guid = guid;
|
|
257 |
vd->vdev_guid_sum = guid;
|
|
258 |
vd->vdev_ops = ops;
|
|
259 |
vd->vdev_state = VDEV_STATE_CLOSED;
|
|
260 |
|
|
261 |
mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
262 |
cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
|
|
263 |
list_create(&vd->vdev_io_pending, sizeof (zio_t),
|
|
264 |
offsetof(zio_t, io_pending));
|
|
265 |
mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
266 |
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
267 |
space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
|
|
268 |
space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
|
|
269 |
txg_list_create(&vd->vdev_ms_list,
|
|
270 |
offsetof(struct metaslab, ms_txg_node));
|
|
271 |
txg_list_create(&vd->vdev_dtl_list,
|
|
272 |
offsetof(struct vdev, vdev_dtl_node));
|
|
273 |
vd->vdev_stat.vs_timestamp = gethrtime();
|
|
274 |
|
|
275 |
return (vd);
|
|
276 |
}
|
|
277 |
|
|
278 |
/*
|
|
279 |
* Free a vdev_t that has been removed from service.
|
|
280 |
*/
|
|
281 |
static void
|
|
282 |
vdev_free_common(vdev_t *vd)
|
|
283 |
{
|
|
284 |
if (vd->vdev_path)
|
|
285 |
spa_strfree(vd->vdev_path);
|
|
286 |
if (vd->vdev_devid)
|
|
287 |
spa_strfree(vd->vdev_devid);
|
|
288 |
|
|
289 |
txg_list_destroy(&vd->vdev_ms_list);
|
|
290 |
txg_list_destroy(&vd->vdev_dtl_list);
|
|
291 |
mutex_enter(&vd->vdev_dtl_lock);
|
|
292 |
space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
|
|
293 |
space_map_destroy(&vd->vdev_dtl_map);
|
|
294 |
space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
|
|
295 |
space_map_destroy(&vd->vdev_dtl_scrub);
|
|
296 |
mutex_exit(&vd->vdev_dtl_lock);
|
|
297 |
mutex_destroy(&vd->vdev_dtl_lock);
|
|
298 |
mutex_destroy(&vd->vdev_dirty_lock);
|
|
299 |
list_destroy(&vd->vdev_io_pending);
|
|
300 |
mutex_destroy(&vd->vdev_io_lock);
|
|
301 |
cv_destroy(&vd->vdev_io_cv);
|
|
302 |
|
|
303 |
kmem_free(vd, sizeof (vdev_t));
|
|
304 |
}
|
|
305 |
|
|
306 |
/*
|
|
307 |
* Allocate a new vdev. The 'alloctype' is used to control whether we are
|
|
308 |
* creating a new vdev or loading an existing one - the behavior is slightly
|
|
309 |
* different for each case.
|
|
310 |
*/
|
|
311 |
vdev_t *
|
|
312 |
vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
|
|
313 |
{
|
|
314 |
vdev_ops_t *ops;
|
|
315 |
char *type;
|
|
316 |
uint64_t guid = 0;
|
|
317 |
vdev_t *vd;
|
|
318 |
|
|
319 |
ASSERT(spa_config_held(spa, RW_WRITER));
|
|
320 |
|
|
321 |
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
|
|
322 |
return (NULL);
|
|
323 |
|
|
324 |
if ((ops = vdev_getops(type)) == NULL)
|
|
325 |
return (NULL);
|
|
326 |
|
|
327 |
/*
|
|
328 |
* If this is a load, get the vdev guid from the nvlist.
|
|
329 |
* Otherwise, vdev_alloc_common() will generate one for us.
|
|
330 |
*/
|
|
331 |
if (alloctype == VDEV_ALLOC_LOAD) {
|
|
332 |
uint64_t label_id;
|
|
333 |
|
|
334 |
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
|
|
335 |
label_id != id)
|
|
336 |
return (NULL);
|
|
337 |
|
|
338 |
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
|
|
339 |
return (NULL);
|
|
340 |
}
|
|
341 |
|
|
342 |
vd = vdev_alloc_common(spa, id, guid, ops);
|
|
343 |
|
|
344 |
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
|
|
345 |
vd->vdev_path = spa_strdup(vd->vdev_path);
|
|
346 |
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
|
|
347 |
vd->vdev_devid = spa_strdup(vd->vdev_devid);
|
|
348 |
|
|
349 |
/*
|
|
350 |
* If we're a top-level vdev, try to load the allocation parameters.
|
|
351 |
*/
|
|
352 |
if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
|
|
353 |
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
|
|
354 |
&vd->vdev_ms_array);
|
|
355 |
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
|
|
356 |
&vd->vdev_ms_shift);
|
|
357 |
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
|
|
358 |
&vd->vdev_ashift);
|
|
359 |
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
|
|
360 |
&vd->vdev_asize);
|
|
361 |
}
|
|
362 |
|
|
363 |
/*
|
|
364 |
* If we're a leaf vdev, try to load the DTL object.
|
|
365 |
*/
|
|
366 |
if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
|
|
367 |
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
|
|
368 |
&vd->vdev_dtl.smo_object);
|
|
369 |
}
|
|
370 |
|
|
371 |
/*
|
|
372 |
* Add ourselves to the parent's list of children.
|
|
373 |
*/
|
|
374 |
vdev_add_child(parent, vd);
|
|
375 |
|
|
376 |
return (vd);
|
|
377 |
}
|
|
378 |
|
|
379 |
void
|
|
380 |
vdev_free(vdev_t *vd)
|
|
381 |
{
|
|
382 |
int c;
|
|
383 |
|
|
384 |
/*
|
|
385 |
* vdev_free() implies closing the vdev first. This is simpler than
|
|
386 |
* trying to ensure complicated semantics for all callers.
|
|
387 |
*/
|
|
388 |
vdev_close(vd);
|
|
389 |
|
|
390 |
/*
|
|
391 |
* It's possible to free a vdev that's been added to the dirty
|
|
392 |
* list when in the middle of spa_vdev_add(). Handle that case
|
|
393 |
* correctly here.
|
|
394 |
*/
|
|
395 |
if (vd->vdev_is_dirty)
|
|
396 |
vdev_config_clean(vd);
|
|
397 |
|
|
398 |
/*
|
|
399 |
* Free all children.
|
|
400 |
*/
|
|
401 |
for (c = 0; c < vd->vdev_children; c++)
|
|
402 |
vdev_free(vd->vdev_child[c]);
|
|
403 |
|
|
404 |
ASSERT(vd->vdev_child == NULL);
|
|
405 |
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
|
|
406 |
|
|
407 |
/*
|
|
408 |
* Discard allocation state.
|
|
409 |
*/
|
|
410 |
if (vd == vd->vdev_top)
|
|
411 |
vdev_metaslab_fini(vd);
|
|
412 |
|
|
413 |
ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
|
|
414 |
ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
|
|
415 |
|
|
416 |
/*
|
|
417 |
* Remove this vdev from its parent's child list.
|
|
418 |
*/
|
|
419 |
vdev_remove_child(vd->vdev_parent, vd);
|
|
420 |
|
|
421 |
ASSERT(vd->vdev_parent == NULL);
|
|
422 |
|
|
423 |
vdev_free_common(vd);
|
|
424 |
}
|
|
425 |
|
|
426 |
/*
|
|
427 |
* Transfer top-level vdev state from svd to tvd.
|
|
428 |
*/
|
|
429 |
static void
|
|
430 |
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
|
|
431 |
{
|
|
432 |
spa_t *spa = svd->vdev_spa;
|
|
433 |
metaslab_t *msp;
|
|
434 |
vdev_t *vd;
|
|
435 |
int t;
|
|
436 |
|
|
437 |
ASSERT(tvd == tvd->vdev_top);
|
|
438 |
|
|
439 |
tvd->vdev_ms_array = svd->vdev_ms_array;
|
|
440 |
tvd->vdev_ms_shift = svd->vdev_ms_shift;
|
|
441 |
tvd->vdev_ms_count = svd->vdev_ms_count;
|
|
442 |
|
|
443 |
svd->vdev_ms_array = 0;
|
|
444 |
svd->vdev_ms_shift = 0;
|
|
445 |
svd->vdev_ms_count = 0;
|
|
446 |
|
|
447 |
tvd->vdev_mg = svd->vdev_mg;
|
|
448 |
tvd->vdev_mg->mg_vd = tvd;
|
|
449 |
tvd->vdev_ms = svd->vdev_ms;
|
|
450 |
tvd->vdev_smo = svd->vdev_smo;
|
|
451 |
|
|
452 |
svd->vdev_mg = NULL;
|
|
453 |
svd->vdev_ms = NULL;
|
|
454 |
svd->vdev_smo = NULL;
|
|
455 |
|
|
456 |
tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
|
|
457 |
tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
|
|
458 |
|
|
459 |
svd->vdev_stat.vs_alloc = 0;
|
|
460 |
svd->vdev_stat.vs_space = 0;
|
|
461 |
|
|
462 |
for (t = 0; t < TXG_SIZE; t++) {
|
|
463 |
while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
|
|
464 |
(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
|
|
465 |
while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
|
|
466 |
(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
|
|
467 |
if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
|
|
468 |
(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
|
|
469 |
tvd->vdev_dirty[t] = svd->vdev_dirty[t];
|
|
470 |
svd->vdev_dirty[t] = 0;
|
|
471 |
}
|
|
472 |
|
|
473 |
if (svd->vdev_is_dirty) {
|
|
474 |
vdev_config_clean(svd);
|
|
475 |
vdev_config_dirty(tvd);
|
|
476 |
}
|
|
477 |
|
|
478 |
ASSERT(svd->vdev_io_retry == NULL);
|
|
479 |
ASSERT(list_is_empty(&svd->vdev_io_pending));
|
|
480 |
}
|
|
481 |
|
|
482 |
static void
|
|
483 |
vdev_top_update(vdev_t *tvd, vdev_t *vd)
|
|
484 |
{
|
|
485 |
int c;
|
|
486 |
|
|
487 |
if (vd == NULL)
|
|
488 |
return;
|
|
489 |
|
|
490 |
vd->vdev_top = tvd;
|
|
491 |
|
|
492 |
for (c = 0; c < vd->vdev_children; c++)
|
|
493 |
vdev_top_update(tvd, vd->vdev_child[c]);
|
|
494 |
}
|
|
495 |
|
|
496 |
/*
|
|
497 |
* Add a mirror/replacing vdev above an existing vdev.
|
|
498 |
*/
|
|
499 |
vdev_t *
|
|
500 |
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
|
|
501 |
{
|
|
502 |
spa_t *spa = cvd->vdev_spa;
|
|
503 |
vdev_t *pvd = cvd->vdev_parent;
|
|
504 |
vdev_t *mvd;
|
|
505 |
|
|
506 |
ASSERT(spa_config_held(spa, RW_WRITER));
|
|
507 |
|
|
508 |
mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
|
|
509 |
vdev_remove_child(pvd, cvd);
|
|
510 |
vdev_add_child(pvd, mvd);
|
|
511 |
cvd->vdev_id = mvd->vdev_children;
|
|
512 |
vdev_add_child(mvd, cvd);
|
|
513 |
vdev_top_update(cvd->vdev_top, cvd->vdev_top);
|
|
514 |
|
|
515 |
mvd->vdev_asize = cvd->vdev_asize;
|
|
516 |
mvd->vdev_ashift = cvd->vdev_ashift;
|
|
517 |
mvd->vdev_state = cvd->vdev_state;
|
|
518 |
|
|
519 |
if (mvd == mvd->vdev_top)
|
|
520 |
vdev_top_transfer(cvd, mvd);
|
|
521 |
|
|
522 |
return (mvd);
|
|
523 |
}
|
|
524 |
|
|
525 |
/*
|
|
526 |
* Remove a 1-way mirror/replacing vdev from the tree.
|
|
527 |
*/
|
|
528 |
void
|
|
529 |
vdev_remove_parent(vdev_t *cvd)
|
|
530 |
{
|
|
531 |
vdev_t *mvd = cvd->vdev_parent;
|
|
532 |
vdev_t *pvd = mvd->vdev_parent;
|
|
533 |
|
|
534 |
ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
|
|
535 |
|
|
536 |
ASSERT(mvd->vdev_children == 1);
|
|
537 |
ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
|
|
538 |
mvd->vdev_ops == &vdev_replacing_ops);
|
|
539 |
|
|
540 |
vdev_remove_child(mvd, cvd);
|
|
541 |
vdev_remove_child(pvd, mvd);
|
|
542 |
cvd->vdev_id = mvd->vdev_id;
|
|
543 |
vdev_add_child(pvd, cvd);
|
|
544 |
vdev_top_update(cvd->vdev_top, cvd->vdev_top);
|
|
545 |
|
|
546 |
if (cvd == cvd->vdev_top)
|
|
547 |
vdev_top_transfer(mvd, cvd);
|
|
548 |
|
|
549 |
ASSERT(mvd->vdev_children == 0);
|
|
550 |
vdev_free(mvd);
|
|
551 |
}
|
|
552 |
|
|
553 |
void
|
|
554 |
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
|
|
555 |
{
|
|
556 |
spa_t *spa = vd->vdev_spa;
|
|
557 |
metaslab_class_t *mc = spa_metaslab_class_select(spa);
|
|
558 |
uint64_t c;
|
|
559 |
uint64_t oldc = vd->vdev_ms_count;
|
|
560 |
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
|
|
561 |
space_map_obj_t *smo = vd->vdev_smo;
|
|
562 |
metaslab_t **mspp = vd->vdev_ms;
|
|
563 |
|
|
564 |
dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
|
|
565 |
|
|
566 |
ASSERT(oldc <= newc);
|
|
567 |
|
|
568 |
vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
|
|
569 |
vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
|
|
570 |
vd->vdev_ms_count = newc;
|
|
571 |
|
|
572 |
if (vd->vdev_mg == NULL) {
|
|
573 |
if (txg == 0) {
|
|
574 |
dmu_buf_t *db;
|
|
575 |
uint64_t *ms_array;
|
|
576 |
|
|
577 |
ms_array = kmem_zalloc(newc * sizeof (uint64_t),
|
|
578 |
KM_SLEEP);
|
|
579 |
|
|
580 |
dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
|
|
581 |
0, newc * sizeof (uint64_t), ms_array);
|
|
582 |
|
|
583 |
for (c = 0; c < newc; c++) {
|
|
584 |
if (ms_array[c] == 0)
|
|
585 |
continue;
|
|
586 |
db = dmu_bonus_hold(spa->spa_meta_objset,
|
|
587 |
ms_array[c]);
|
|
588 |
dmu_buf_read(db);
|
|
589 |
ASSERT3U(db->db_size, ==, sizeof (*smo));
|
|
590 |
bcopy(db->db_data, &vd->vdev_smo[c],
|
|
591 |
db->db_size);
|
|
592 |
ASSERT3U(vd->vdev_smo[c].smo_object, ==,
|
|
593 |
ms_array[c]);
|
|
594 |
dmu_buf_rele(db);
|
|
595 |
}
|
|
596 |
kmem_free(ms_array, newc * sizeof (uint64_t));
|
|
597 |
}
|
|
598 |
vd->vdev_mg = metaslab_group_create(mc, vd);
|
|
599 |
}
|
|
600 |
|
|
601 |
for (c = 0; c < oldc; c++) {
|
|
602 |
vd->vdev_smo[c] = smo[c];
|
|
603 |
vd->vdev_ms[c] = mspp[c];
|
|
604 |
mspp[c]->ms_smo = &vd->vdev_smo[c];
|
|
605 |
}
|
|
606 |
|
|
607 |
for (c = oldc; c < newc; c++)
|
|
608 |
metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
|
|
609 |
c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
|
|
610 |
|
|
611 |
if (oldc != 0) {
|
|
612 |
kmem_free(smo, oldc * sizeof (*smo));
|
|
613 |
kmem_free(mspp, oldc * sizeof (*mspp));
|
|
614 |
}
|
|
615 |
|
|
616 |
}
|
|
617 |
|
|
618 |
void
|
|
619 |
vdev_metaslab_fini(vdev_t *vd)
|
|
620 |
{
|
|
621 |
uint64_t m;
|
|
622 |
uint64_t count = vd->vdev_ms_count;
|
|
623 |
|
|
624 |
if (vd->vdev_ms != NULL) {
|
|
625 |
for (m = 0; m < count; m++)
|
|
626 |
metaslab_fini(vd->vdev_ms[m]);
|
|
627 |
kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
|
|
628 |
vd->vdev_ms = NULL;
|
|
629 |
}
|
|
630 |
|
|
631 |
if (vd->vdev_smo != NULL) {
|
|
632 |
kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
|
|
633 |
vd->vdev_smo = NULL;
|
|
634 |
}
|
|
635 |
}
|
|
636 |
|
|
637 |
/*
|
|
638 |
* Prepare a virtual device for access.
|
|
639 |
*/
|
|
640 |
int
|
|
641 |
vdev_open(vdev_t *vd)
|
|
642 |
{
|
|
643 |
int error;
|
|
644 |
vdev_knob_t *vk;
|
|
645 |
int c;
|
|
646 |
uint64_t osize = 0;
|
|
647 |
uint64_t asize, psize;
|
|
648 |
uint64_t ashift = -1ULL;
|
|
649 |
|
|
650 |
ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
|
|
651 |
vd->vdev_state == VDEV_STATE_CANT_OPEN ||
|
|
652 |
vd->vdev_state == VDEV_STATE_OFFLINE);
|
|
653 |
|
|
654 |
if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
|
|
655 |
vd->vdev_fault_arg >>= 1;
|
|
656 |
else
|
|
657 |
vd->vdev_fault_mode = VDEV_FAULT_NONE;
|
|
658 |
|
|
659 |
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
|
|
660 |
|
|
661 |
for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
|
|
662 |
uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);
|
|
663 |
|
|
664 |
*valp = vk->vk_default;
|
|
665 |
*valp = MAX(*valp, vk->vk_min);
|
|
666 |
*valp = MIN(*valp, vk->vk_max);
|
|
667 |
}
|
|
668 |
|
|
669 |
if (vd->vdev_ops->vdev_op_leaf) {
|
|
670 |
vdev_cache_init(vd);
|
|
671 |
vdev_queue_init(vd);
|
|
672 |
vd->vdev_cache_active = B_TRUE;
|
|
673 |
}
|
|
674 |
|
|
675 |
if (vd->vdev_offline) {
|
|
676 |
ASSERT(vd->vdev_children == 0);
|
|
677 |
dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
|
|
678 |
vd->vdev_state = VDEV_STATE_OFFLINE;
|
|
679 |
return (ENXIO);
|
|
680 |
}
|
|
681 |
|
|
682 |
error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
|
|
683 |
|
|
684 |
dprintf("%s = %d, osize %llu, state = %d\n",
|
|
685 |
vdev_description(vd), error, osize, vd->vdev_state);
|
|
686 |
|
|
687 |
if (error) {
|
|
688 |
dprintf("%s in %s failed to open, error %d, aux %d\n",
|
|
689 |
vdev_description(vd),
|
|
690 |
vdev_description(vd->vdev_parent),
|
|
691 |
error,
|
|
692 |
vd->vdev_stat.vs_aux);
|
|
693 |
|
|
694 |
vd->vdev_state = VDEV_STATE_CANT_OPEN;
|
|
695 |
return (error);
|
|
696 |
}
|
|
697 |
|
|
698 |
vd->vdev_state = VDEV_STATE_HEALTHY;
|
|
699 |
|
|
700 |
for (c = 0; c < vd->vdev_children; c++)
|
|
701 |
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
|
|
702 |
vd->vdev_state = VDEV_STATE_DEGRADED;
|
|
703 |
|
|
704 |
osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
|
|
705 |
|
|
706 |
if (vd->vdev_children == 0) {
|
|
707 |
if (osize < SPA_MINDEVSIZE) {
|
|
708 |
vd->vdev_state = VDEV_STATE_CANT_OPEN;
|
|
709 |
vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
|
|
710 |
return (EOVERFLOW);
|
|
711 |
}
|
|
712 |
psize = osize;
|
|
713 |
asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
|
|
714 |
} else {
|
|
715 |
if (osize < SPA_MINDEVSIZE -
|
|
716 |
(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
|
|
717 |
vd->vdev_state = VDEV_STATE_CANT_OPEN;
|
|
718 |
vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
|
|
719 |
return (EOVERFLOW);
|
|
720 |
}
|
|
721 |
psize = 0;
|
|
722 |
asize = osize;
|
|
723 |
}
|
|
724 |
|
|
725 |
vd->vdev_psize = psize;
|
|
726 |
|
|
727 |
if (vd->vdev_asize == 0) {
|
|
728 |
/*
|
|
729 |
* This is the first-ever open, so use the computed values.
|
|
730 |
*/
|
|
731 |
vd->vdev_asize = asize;
|
|
732 |
vd->vdev_ashift = ashift;
|
|
733 |
} else {
|
|
734 |
/*
|
|
735 |
* Make sure the alignment requirement hasn't increased.
|
|
736 |
*/
|
|
737 |
if (ashift > vd->vdev_ashift) {
|
|
738 |
dprintf("%s: ashift grew\n", vdev_description(vd));
|
|
739 |
vd->vdev_state = VDEV_STATE_CANT_OPEN;
|
|
740 |
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
|
|
741 |
return (EINVAL);
|
|
742 |
}
|
|
743 |
|
|
744 |
/*
|
|
745 |
* Make sure the device hasn't shrunk.
|
|
746 |
*/
|
|
747 |
if (asize < vd->vdev_asize) {
|
|
748 |
dprintf("%s: device shrank\n", vdev_description(vd));
|
|
749 |
vd->vdev_state = VDEV_STATE_CANT_OPEN;
|
|
750 |
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
|
|
751 |
return (EINVAL);
|
|
752 |
}
|
|
753 |
|
|
754 |
/*
|
|
755 |
* If all children are healthy and the asize has increased,
|
|
756 |
* then we've experienced dynamic LUN growth.
|
|
757 |
*/
|
|
758 |
if (vd->vdev_state == VDEV_STATE_HEALTHY &&
|
|
759 |
asize > vd->vdev_asize) {
|
|
760 |
dprintf("%s: device grew\n", vdev_description(vd));
|
|
761 |
vd->vdev_asize = asize;
|
|
762 |
}
|
|
763 |
}
|
|
764 |
|
|
765 |
return (0);
|
|
766 |
}
|
|
767 |
|
|
768 |
/*
|
|
769 |
* Close a virtual device.
|
|
770 |
*/
|
|
771 |
void
|
|
772 |
vdev_close(vdev_t *vd)
|
|
773 |
{
|
|
774 |
ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);
|
|
775 |
|
|
776 |
vd->vdev_ops->vdev_op_close(vd);
|
|
777 |
|
|
778 |
if (vd->vdev_cache_active) {
|
|
779 |
vdev_cache_fini(vd);
|
|
780 |
vdev_queue_fini(vd);
|
|
781 |
vd->vdev_cache_active = B_FALSE;
|
|
782 |
}
|
|
783 |
|
|
784 |
if (vd->vdev_offline)
|
|
785 |
vd->vdev_state = VDEV_STATE_OFFLINE;
|
|
786 |
else
|
|
787 |
vd->vdev_state = VDEV_STATE_CLOSED;
|
|
788 |
}
|
|
789 |
|
|
790 |
void
|
|
791 |
vdev_reopen(vdev_t *vd, zio_t **rq)
|
|
792 |
{
|
|
793 |
vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
|
|
794 |
int c;
|
|
795 |
|
|
796 |
if (vd == rvd) {
|
|
797 |
ASSERT(rq == NULL);
|
|
798 |
for (c = 0; c < rvd->vdev_children; c++)
|
|
799 |
vdev_reopen(rvd->vdev_child[c], NULL);
|
|
800 |
return;
|
|
801 |
}
|
|
802 |
|
|
803 |
/* only valid for top-level vdevs */
|
|
804 |
ASSERT3P(vd, ==, vd->vdev_top);
|
|
805 |
|
|
806 |
/*
|
|
807 |
* vdev_state can change when spa_config_lock is held as writer,
|
|
808 |
* or when it's held as reader and we're doing a vdev_reopen().
|
|
809 |
* To handle the latter case, we grab rvd's io_lock to serialize
|
|
810 |
* reopens. This ensures that there's never more than one vdev
|
|
811 |
* state changer active at a time.
|
|
812 |
*/
|
|
813 |
mutex_enter(&rvd->vdev_io_lock);
|
|
814 |
|
|
815 |
mutex_enter(&vd->vdev_io_lock);
|
|
816 |
while (list_head(&vd->vdev_io_pending) != NULL)
|
|
817 |
cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
|
|
818 |
vdev_close(vd);
|
|
819 |
(void) vdev_open(vd);
|
|
820 |
if (rq != NULL) {
|
|
821 |
*rq = vd->vdev_io_retry;
|
|
822 |
vd->vdev_io_retry = NULL;
|
|
823 |
}
|
|
824 |
mutex_exit(&vd->vdev_io_lock);
|
|
825 |
|
|
826 |
/*
|
|
827 |
* Reassess root vdev's health.
|
|
828 |
*/
|
|
829 |
rvd->vdev_state = VDEV_STATE_HEALTHY;
|
|
830 |
for (c = 0; c < rvd->vdev_children; c++) {
|
|
831 |
uint64_t state = rvd->vdev_child[c]->vdev_state;
|
|
832 |
rvd->vdev_state = MIN(rvd->vdev_state, state);
|
|
833 |
}
|
|
834 |
|
|
835 |
mutex_exit(&rvd->vdev_io_lock);
|
|
836 |
}
|
|
837 |
|
|
838 |
int
|
|
839 |
vdev_create(vdev_t *vd, uint64_t txg)
|
|
840 |
{
|
|
841 |
int error;
|
|
842 |
|
|
843 |
/*
|
|
844 |
* Normally, partial opens (e.g. of a mirror) are allowed.
|
|
845 |
* For a create, however, we want to fail the request if
|
|
846 |
* there are any components we can't open.
|
|
847 |
*/
|
|
848 |
error = vdev_open(vd);
|
|
849 |
|
|
850 |
if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
|
|
851 |
vdev_close(vd);
|
|
852 |
return (error ? error : ENXIO);
|
|
853 |
}
|
|
854 |
|
|
855 |
/*
|
|
856 |
* Recursively initialize all labels.
|
|
857 |
*/
|
|
858 |
if ((error = vdev_label_init(vd, txg)) != 0) {
|
|
859 |
vdev_close(vd);
|
|
860 |
return (error);
|
|
861 |
}
|
|
862 |
|
|
863 |
return (0);
|
|
864 |
}
|
|
865 |
|
|
866 |
/*
|
|
867 |
* The is the latter half of vdev_create(). It is distinct because it
|
|
868 |
* involves initiating transactions in order to do metaslab creation.
|
|
869 |
* For creation, we want to try to create all vdevs at once and then undo it
|
|
870 |
* if anything fails; this is much harder if we have pending transactions.
|
|
871 |
*/
|
|
872 |
void
|
|
873 |
vdev_init(vdev_t *vd, uint64_t txg)
|
|
874 |
{
|
|
875 |
/*
|
|
876 |
* Aim for roughly 200 metaslabs per vdev.
|
|
877 |
*/
|
|
878 |
vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
|
|
879 |
vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
|
|
880 |
|
|
881 |
/*
|
|
882 |
* Initialize the vdev's metaslabs.
|
|
883 |
*/
|
|
884 |
vdev_metaslab_init(vd, txg);
|
|
885 |
}
|
|
886 |
|
|
887 |
void
|
|
888 |
vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
|
|
889 |
{
|
|
890 |
vdev_t *tvd = vd->vdev_top;
|
|
891 |
|
|
892 |
mutex_enter(&tvd->vdev_dirty_lock);
|
|
893 |
if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
|
|
894 |
tvd->vdev_dirty[txg & TXG_MASK] |= flags;
|
|
895 |
(void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
|
|
896 |
tvd, txg);
|
|
897 |
}
|
|
898 |
mutex_exit(&tvd->vdev_dirty_lock);
|
|
899 |
}
|
|
900 |
|
|
901 |
void
|
|
902 |
vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
|
|
903 |
{
|
|
904 |
mutex_enter(sm->sm_lock);
|
|
905 |
if (!space_map_contains(sm, txg, size))
|
|
906 |
space_map_add(sm, txg, size);
|
|
907 |
mutex_exit(sm->sm_lock);
|
|
908 |
}
|
|
909 |
|
|
910 |
int
|
|
911 |
vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
|
|
912 |
{
|
|
913 |
int dirty;
|
|
914 |
|
|
915 |
/*
|
|
916 |
* Quick test without the lock -- covers the common case that
|
|
917 |
* there are no dirty time segments.
|
|
918 |
*/
|
|
919 |
if (sm->sm_space == 0)
|
|
920 |
return (0);
|
|
921 |
|
|
922 |
mutex_enter(sm->sm_lock);
|
|
923 |
dirty = space_map_contains(sm, txg, size);
|
|
924 |
mutex_exit(sm->sm_lock);
|
|
925 |
|
|
926 |
return (dirty);
|
|
927 |
}
|
|
928 |
|
|
929 |
/*
|
|
930 |
* Reassess DTLs after a config change or scrub completion.
|
|
931 |
*/
|
|
932 |
void
|
|
933 |
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
|
|
934 |
{
|
|
935 |
int c;
|
|
936 |
|
|
937 |
ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));
|
|
938 |
|
|
939 |
if (vd->vdev_children == 0) {
|
|
940 |
mutex_enter(&vd->vdev_dtl_lock);
|
|
941 |
/*
|
|
942 |
* We're successfully scrubbed everything up to scrub_txg.
|
|
943 |
* Therefore, excise all old DTLs up to that point, then
|
|
944 |
* fold in the DTLs for everything we couldn't scrub.
|
|
945 |
*/
|
|
946 |
if (scrub_txg != 0) {
|
|
947 |
space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
|
|
948 |
space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
|
|
949 |
}
|
|
950 |
if (scrub_done)
|
|
951 |
space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
|
|
952 |
mutex_exit(&vd->vdev_dtl_lock);
|
|
953 |
if (txg != 0) {
|
|
954 |
vdev_t *tvd = vd->vdev_top;
|
|
955 |
vdev_dirty(tvd, VDD_DTL, txg);
|
|
956 |
(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
|
|
957 |
}
|
|
958 |
return;
|
|
959 |
}
|
|
960 |
|
|
961 |
mutex_enter(&vd->vdev_dtl_lock);
|
|
962 |
space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
|
|
963 |
space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
|
|
964 |
mutex_exit(&vd->vdev_dtl_lock);
|
|
965 |
|
|
966 |
for (c = 0; c < vd->vdev_children; c++) {
|
|
967 |
vdev_t *cvd = vd->vdev_child[c];
|
|
968 |
vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
|
|
969 |
mutex_enter(&vd->vdev_dtl_lock);
|
|
970 |
space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
|
|
971 |
space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
|
|
972 |
mutex_exit(&vd->vdev_dtl_lock);
|
|
973 |
}
|
|
974 |
}
|
|
975 |
|
|
976 |
static int
|
|
977 |
vdev_dtl_load(vdev_t *vd)
|
|
978 |
{
|
|
979 |
spa_t *spa = vd->vdev_spa;
|
|
980 |
space_map_obj_t *smo = &vd->vdev_dtl;
|
|
981 |
dmu_buf_t *db;
|
|
982 |
int error;
|
|
983 |
|
|
984 |
ASSERT(vd->vdev_children == 0);
|
|
985 |
|
|
986 |
if (smo->smo_object == 0)
|
|
987 |
return (0);
|
|
988 |
|
|
989 |
db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
|
|
990 |
dmu_buf_read(db);
|
|
991 |
ASSERT3U(db->db_size, ==, sizeof (*smo));
|
|
992 |
bcopy(db->db_data, smo, db->db_size);
|
|
993 |
dmu_buf_rele(db);
|
|
994 |
|
|
995 |
mutex_enter(&vd->vdev_dtl_lock);
|
|
996 |
error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
|
|
997 |
spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
|
|
998 |
mutex_exit(&vd->vdev_dtl_lock);
|
|
999 |
|
|
1000 |
return (error);
|
|
1001 |
}
|
|
1002 |
|
|
1003 |
void
|
|
1004 |
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
|
|
1005 |
{
|
|
1006 |
spa_t *spa = vd->vdev_spa;
|
|
1007 |
space_map_obj_t *smo = &vd->vdev_dtl;
|
|
1008 |
space_map_t *sm = &vd->vdev_dtl_map;
|
|
1009 |
space_map_t smsync;
|
|
1010 |
kmutex_t smlock;
|
|
1011 |
avl_tree_t *t = &sm->sm_root;
|
|
1012 |
space_seg_t *ss;
|
|
1013 |
dmu_buf_t *db;
|
|
1014 |
dmu_tx_t *tx;
|
|
1015 |
|
|
1016 |
dprintf("%s in txg %llu pass %d\n",
|
|
1017 |
vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
|
|
1018 |
|
|
1019 |
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
|
|
1020 |
|
|
1021 |
if (vd->vdev_detached) {
|
|
1022 |
if (smo->smo_object != 0) {
|
|
1023 |
int err = dmu_object_free(spa->spa_meta_objset,
|
|
1024 |
smo->smo_object, tx);
|
|
1025 |
ASSERT3U(err, ==, 0);
|
|
1026 |
smo->smo_object = 0;
|
|
1027 |
}
|
|
1028 |
dmu_tx_commit(tx);
|
|
1029 |
return;
|
|
1030 |
}
|
|
1031 |
|
|
1032 |
if (smo->smo_object == 0) {
|
|
1033 |
ASSERT(smo->smo_objsize == 0);
|
|
1034 |
ASSERT(smo->smo_alloc == 0);
|
|
1035 |
smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
|
|
1036 |
DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
|
|
1037 |
DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
|
|
1038 |
ASSERT(smo->smo_object != 0);
|
|
1039 |
vdev_config_dirty(vd->vdev_top);
|
|
1040 |
}
|
|
1041 |
|
|
1042 |
dmu_free_range(spa->spa_meta_objset, smo->smo_object,
|
|
1043 |
0, smo->smo_objsize, tx);
|
|
1044 |
|
|
1045 |
mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
|
|
1046 |
|
|
1047 |
space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
|
|
1048 |
&smlock);
|
|
1049 |
|
|
1050 |
mutex_enter(&smlock);
|
|
1051 |
|
|
1052 |
mutex_enter(&vd->vdev_dtl_lock);
|
|
1053 |
for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
|
|
1054 |
space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
|
|
1055 |
mutex_exit(&vd->vdev_dtl_lock);
|
|
1056 |
|
|
1057 |
smo->smo_objsize = 0;
|
|
1058 |
smo->smo_alloc = smsync.sm_space;
|
|
1059 |
|
|
1060 |
space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
|
|
1061 |
space_map_destroy(&smsync);
|
|
1062 |
|
|
1063 |
mutex_exit(&smlock);
|
|
1064 |
mutex_destroy(&smlock);
|
|
1065 |
|
|
1066 |
db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
|
|
1067 |
dmu_buf_will_dirty(db, tx);
|
|
1068 |
ASSERT3U(db->db_size, ==, sizeof (*smo));
|
|
1069 |
bcopy(smo, db->db_data, db->db_size);
|
|
1070 |
dmu_buf_rele(db);
|
|
1071 |
|
|
1072 |
dmu_tx_commit(tx);
|
|
1073 |
}
|
|
1074 |
|
|
1075 |
int
|
|
1076 |
vdev_load(vdev_t *vd, int import)
|
|
1077 |
{
|
|
1078 |
spa_t *spa = vd->vdev_spa;
|
|
1079 |
int c, error;
|
|
1080 |
nvlist_t *label;
|
|
1081 |
uint64_t guid, state;
|
|
1082 |
|
|
1083 |
dprintf("loading %s\n", vdev_description(vd));
|
|
1084 |
|
|
1085 |
/*
|
|
1086 |
* Recursively load all children.
|
|
1087 |
*/
|
|
1088 |
for (c = 0; c < vd->vdev_children; c++)
|
|
1089 |
if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
|
|
1090 |
return (error);
|
|
1091 |
|
|
1092 |
/*
|
|
1093 |
* If this is a leaf vdev, make sure its agrees with its disk labels.
|
|
1094 |
*/
|
|
1095 |
if (vd->vdev_ops->vdev_op_leaf) {
|
|
1096 |
|
|
1097 |
if (vdev_is_dead(vd))
|
|
1098 |
return (0);
|
|
1099 |
|
|
1100 |
/*
|
|
1101 |
* XXX state transitions don't propagate to parent here.
|
|
1102 |
* Also, merely setting the state isn't sufficient because
|
|
1103 |
* it's not persistent; a vdev_reopen() would make us
|
|
1104 |
* forget all about it.
|
|
1105 |
*/
|
|
1106 |
if ((label = vdev_label_read_config(vd)) == NULL) {
|
|
1107 |
dprintf("can't load label config\n");
|
|
1108 |
vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
|
|
1109 |
VDEV_AUX_CORRUPT_DATA);
|
|
1110 |
return (0);
|
|
1111 |
}
|
|
1112 |
|
|
1113 |
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
|
|
1114 |
&guid) != 0 || guid != spa_guid(spa)) {
|
|
1115 |
dprintf("bad or missing pool GUID (%llu)\n", guid);
|
|
1116 |
vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
|
|
1117 |
VDEV_AUX_CORRUPT_DATA);
|
|
1118 |
nvlist_free(label);
|
|
1119 |
return (0);
|
|
1120 |
}
|
|
1121 |
|
|
1122 |
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
|
|
1123 |
guid != vd->vdev_guid) {
|
|
1124 |
dprintf("bad or missing vdev guid (%llu != %llu)\n",
|
|
1125 |
guid, vd->vdev_guid);
|
|
1126 |
vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
|
|
1127 |
VDEV_AUX_CORRUPT_DATA);
|
|
1128 |
nvlist_free(label);
|
|
1129 |
return (0);
|
|
1130 |
}
|
|
1131 |
|
|
1132 |
/*
|
|
1133 |
* If we find a vdev with a matching pool guid and vdev guid,
|
|
1134 |
* but the pool state is not active, it indicates that the user
|
|
1135 |
* exported or destroyed the pool without affecting the config
|
|
1136 |
* cache (if / was mounted readonly, for example). In this
|
|
1137 |
* case, immediately return EBADF so the caller can remove it
|
|
1138 |
* from the config.
|
|
1139 |
*/
|
|
1140 |
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
|
|
1141 |
&state)) {
|
|
1142 |
dprintf("missing pool state\n");
|
|
1143 |
vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
|
|
1144 |
VDEV_AUX_CORRUPT_DATA);
|
|
1145 |
nvlist_free(label);
|
|
1146 |
return (0);
|
|
1147 |
}
|
|
1148 |
|
|
1149 |
if (state != POOL_STATE_ACTIVE &&
|
|
1150 |
(!import || state != POOL_STATE_EXPORTED)) {
|
|
1151 |
dprintf("pool state not active (%llu)\n", state);
|
|
1152 |
nvlist_free(label);
|
|
1153 |
return (EBADF);
|
|
1154 |
}
|
|
1155 |
|
|
1156 |
nvlist_free(label);
|
|
1157 |
}
|
|
1158 |
|
|
1159 |
/*
|
|
1160 |
* If this is a top-level vdev, make sure its allocation parameters
|
|
1161 |
* exist and initialize its metaslabs.
|
|
1162 |
*/
|
|
1163 |
if (vd == vd->vdev_top) {
|
|
1164 |
|
|
1165 |
if (vd->vdev_ms_array == 0 ||
|
|
1166 |
vd->vdev_ms_shift == 0 ||
|
|
1167 |
vd->vdev_ashift == 0 ||
|
|
1168 |
vd->vdev_asize == 0) {
|
|
1169 |
vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
|
|
1170 |
VDEV_AUX_CORRUPT_DATA);
|
|
1171 |
return (0);
|
|
1172 |
}
|
|
1173 |
|
|
1174 |
vdev_metaslab_init(vd, 0);
|
|
1175 |
}
|
|
1176 |
|
|
1177 |
/*
|
|
1178 |
* If this is a leaf vdev, load its DTL.
|
|
1179 |
*/
|
|
1180 |
if (vd->vdev_ops->vdev_op_leaf) {
|
|
1181 |
error = vdev_dtl_load(vd);
|
|
1182 |
if (error) {
|
|
1183 |
dprintf("can't load DTL for %s, error %d\n",
|
|
1184 |
vdev_description(vd), error);
|
|
1185 |
vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
|
|
1186 |
VDEV_AUX_CORRUPT_DATA);
|
|
1187 |
return (0);
|
|
1188 |
}
|
|
1189 |
}
|
|
1190 |
|
|
1191 |
return (0);
|
|
1192 |
}
|
|
1193 |
|
|
1194 |
void
|
|
1195 |
vdev_sync_done(vdev_t *vd, uint64_t txg)
|
|
1196 |
{
|
|
1197 |
metaslab_t *msp;
|
|
1198 |
|
|
1199 |
dprintf("%s txg %llu\n", vdev_description(vd), txg);
|
|
1200 |
|
|
1201 |
while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
|
|
1202 |
metaslab_sync_done(msp, txg);
|
|
1203 |
}
|
|
1204 |
|
|
1205 |
void
|
|
1206 |
vdev_add_sync(vdev_t *vd, uint64_t txg)
|
|
1207 |
{
|
|
1208 |
spa_t *spa = vd->vdev_spa;
|
|
1209 |
dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
|
|
1210 |
|
|
1211 |
ASSERT(vd == vd->vdev_top);
|
|
1212 |
|
|
1213 |
if (vd->vdev_ms_array == 0)
|
|
1214 |
vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
|
|
1215 |
DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
|
|
1216 |
|
|
1217 |
ASSERT(vd->vdev_ms_array != 0);
|
|
1218 |
|
|
1219 |
vdev_config_dirty(vd);
|
|
1220 |
|
|
1221 |
dmu_tx_commit(tx);
|
|
1222 |
}
|
|
1223 |
|
|
1224 |
void
|
|
1225 |
vdev_sync(vdev_t *vd, uint64_t txg)
|
|
1226 |
{
|
|
1227 |
spa_t *spa = vd->vdev_spa;
|
|
1228 |
vdev_t *lvd;
|
|
1229 |
metaslab_t *msp;
|
|
1230 |
uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
|
|
1231 |
uint8_t dirty = *dirtyp;
|
|
1232 |
|
|
1233 |
mutex_enter(&vd->vdev_dirty_lock);
|
|
1234 |
*dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
|
|
1235 |
mutex_exit(&vd->vdev_dirty_lock);
|
|
1236 |
|
|
1237 |
dprintf("%s txg %llu pass %d\n",
|
|
1238 |
vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
|
|
1239 |
|
|
1240 |
if (dirty & VDD_ADD)
|
|
1241 |
vdev_add_sync(vd, txg);
|
|
1242 |
|
|
1243 |
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
|
|
1244 |
metaslab_sync(msp, txg);
|
|
1245 |
|
|
1246 |
while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
|
|
1247 |
vdev_dtl_sync(lvd, txg);
|
|
1248 |
|
|
1249 |
(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
|
|
1250 |
}
|
|
1251 |
|
|
1252 |
uint64_t
|
|
1253 |
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
|
|
1254 |
{
|
|
1255 |
return (vd->vdev_ops->vdev_op_asize(vd, psize));
|
|
1256 |
}
|
|
1257 |
|
|
1258 |
void
|
|
1259 |
vdev_io_start(zio_t *zio)
|
|
1260 |
{
|
|
1261 |
zio->io_vd->vdev_ops->vdev_op_io_start(zio);
|
|
1262 |
}
|
|
1263 |
|
|
1264 |
void
|
|
1265 |
vdev_io_done(zio_t *zio)
|
|
1266 |
{
|
|
1267 |
zio->io_vd->vdev_ops->vdev_op_io_done(zio);
|
|
1268 |
}
|
|
1269 |
|
|
1270 |
const char *
|
|
1271 |
vdev_description(vdev_t *vd)
|
|
1272 |
{
|
|
1273 |
if (vd == NULL || vd->vdev_ops == NULL)
|
|
1274 |
return ("<unknown>");
|
|
1275 |
|
|
1276 |
if (vd->vdev_path != NULL)
|
|
1277 |
return (vd->vdev_path);
|
|
1278 |
|
|
1279 |
if (vd->vdev_parent == NULL)
|
|
1280 |
return (spa_name(vd->vdev_spa));
|
|
1281 |
|
|
1282 |
return (vd->vdev_ops->vdev_op_type);
|
|
1283 |
}
|
|
1284 |
|
|
1285 |
int
|
|
1286 |
vdev_online(spa_t *spa, const char *path)
|
|
1287 |
{
|
|
1288 |
vdev_t *vd;
|
|
1289 |
|
|
1290 |
spa_config_enter(spa, RW_WRITER);
|
|
1291 |
|
|
1292 |
if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
|
|
1293 |
spa_config_exit(spa);
|
|
1294 |
return (ENODEV);
|
|
1295 |
}
|
|
1296 |
|
|
1297 |
dprintf("ONLINE: %s\n", vdev_description(vd));
|
|
1298 |
|
|
1299 |
vd->vdev_offline = B_FALSE;
|
|
1300 |
|
|
1301 |
/*
|
|
1302 |
* Clear the error counts. The idea is that you expect to see all
|
|
1303 |
* zeroes when everything is working, so if you've just onlined a
|
|
1304 |
* device, you don't want to keep hearing about errors from before.
|
|
1305 |
*/
|
|
1306 |
vd->vdev_stat.vs_read_errors = 0;
|
|
1307 |
vd->vdev_stat.vs_write_errors = 0;
|
|
1308 |
vd->vdev_stat.vs_checksum_errors = 0;
|
|
1309 |
|
|
1310 |
vdev_reopen(vd->vdev_top, NULL);
|
|
1311 |
|
|
1312 |
spa_config_exit(spa);
|
|
1313 |
|
|
1314 |
VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
|
|
1315 |
|
|
1316 |
return (0);
|
|
1317 |
}
|
|
1318 |
|
|
1319 |
int
|
|
1320 |
vdev_offline(spa_t *spa, const char *path)
|
|
1321 |
{
|
|
1322 |
vdev_t *vd;
|
|
1323 |
|
|
1324 |
spa_config_enter(spa, RW_WRITER);
|
|
1325 |
|
|
1326 |
if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
|
|
1327 |
spa_config_exit(spa);
|
|
1328 |
return (ENODEV);
|
|
1329 |
}
|
|
1330 |
|
|
1331 |
dprintf("OFFLINE: %s\n", vdev_description(vd));
|
|
1332 |
|
|
1333 |
/*
|
|
1334 |
* If this device's top-level vdev has a non-empty DTL,
|
|
1335 |
* don't allow the device to be offlined.
|
|
1336 |
*
|
|
1337 |
* XXX -- we should make this more precise by allowing the offline
|
|
1338 |
* as long as the remaining devices don't have any DTL holes.
|
|
1339 |
*/
|
|
1340 |
if (vd->vdev_top->vdev_dtl_map.sm_space != 0) {
|
|
1341 |
spa_config_exit(spa);
|
|
1342 |
return (EBUSY);
|
|
1343 |
}
|
|
1344 |
|
|
1345 |
/*
|
|
1346 |
* Set this device to offline state and reopen its top-level vdev.
|
|
1347 |
* If this action results in the top-level vdev becoming unusable,
|
|
1348 |
* undo it and fail the request.
|
|
1349 |
*/
|
|
1350 |
vd->vdev_offline = B_TRUE;
|
|
1351 |
vdev_reopen(vd->vdev_top, NULL);
|
|
1352 |
if (vdev_is_dead(vd->vdev_top)) {
|
|
1353 |
vd->vdev_offline = B_FALSE;
|
|
1354 |
vdev_reopen(vd->vdev_top, NULL);
|
|
1355 |
spa_config_exit(spa);
|
|
1356 |
return (EBUSY);
|
|
1357 |
}
|
|
1358 |
|
|
1359 |
spa_config_exit(spa);
|
|
1360 |
|
|
1361 |
return (0);
|
|
1362 |
}
|
|
1363 |
|
|
1364 |
int
|
|
1365 |
vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
|
|
1366 |
{
|
|
1367 |
vdev_t *vd;
|
|
1368 |
|
|
1369 |
spa_config_enter(spa, RW_WRITER);
|
|
1370 |
|
|
1371 |
if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
|
|
1372 |
spa_config_exit(spa);
|
|
1373 |
return (ENODEV);
|
|
1374 |
}
|
|
1375 |
|
|
1376 |
vd->vdev_fault_mode = mode;
|
|
1377 |
vd->vdev_fault_mask = mask;
|
|
1378 |
vd->vdev_fault_arg = arg;
|
|
1379 |
|
|
1380 |
spa_config_exit(spa);
|
|
1381 |
|
|
1382 |
return (0);
|
|
1383 |
}
|
|
1384 |
|
|
1385 |
int
|
|
1386 |
vdev_is_dead(vdev_t *vd)
|
|
1387 |
{
|
|
1388 |
return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
|
|
1389 |
}
|
|
1390 |
|
|
1391 |
int
|
|
1392 |
vdev_error_inject(vdev_t *vd, zio_t *zio)
|
|
1393 |
{
|
|
1394 |
int error = 0;
|
|
1395 |
|
|
1396 |
if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
|
|
1397 |
return (0);
|
|
1398 |
|
|
1399 |
if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
|
|
1400 |
return (0);
|
|
1401 |
|
|
1402 |
switch (vd->vdev_fault_mode) {
|
|
1403 |
case VDEV_FAULT_RANDOM:
|
|
1404 |
if (spa_get_random(vd->vdev_fault_arg) == 0)
|
|
1405 |
error = EIO;
|
|
1406 |
break;
|
|
1407 |
|
|
1408 |
case VDEV_FAULT_COUNT:
|
|
1409 |
if ((int64_t)--vd->vdev_fault_arg <= 0)
|
|
1410 |
vd->vdev_fault_mode = VDEV_FAULT_NONE;
|
|
1411 |
error = EIO;
|
|
1412 |
break;
|
|
1413 |
}
|
|
1414 |
|
|
1415 |
if (error != 0) {
|
|
1416 |
dprintf("returning %d for type %d on %s state %d offset %llx\n",
|
|
1417 |
error, zio->io_type, vdev_description(vd),
|
|
1418 |
vd->vdev_state, zio->io_offset);
|
|
1419 |
}
|
|
1420 |
|
|
1421 |
return (error);
|
|
1422 |
}
|
|
1423 |
|
|
1424 |
/*
|
|
1425 |
* Get statistics for the given vdev.
|
|
1426 |
*/
|
|
1427 |
void
|
|
1428 |
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
|
|
1429 |
{
|
|
1430 |
vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
|
|
1431 |
int c, t;
|
|
1432 |
|
|
1433 |
mutex_enter(&vd->vdev_stat_lock);
|
|
1434 |
bcopy(&vd->vdev_stat, vs, sizeof (*vs));
|
|
1435 |
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
|
|
1436 |
vs->vs_state = vd->vdev_state;
|
|
1437 |
mutex_exit(&vd->vdev_stat_lock);
|
|
1438 |
|
|
1439 |
/*
|
|
1440 |
* If we're getting stats on the root vdev, aggregate the I/O counts
|
|
1441 |
* over all top-level vdevs (i.e. the direct children of the root).
|
|
1442 |
*/
|
|
1443 |
if (vd == rvd) {
|
|
1444 |
for (c = 0; c < rvd->vdev_children; c++) {
|
|
1445 |
vdev_t *cvd = rvd->vdev_child[c];
|
|
1446 |
vdev_stat_t *cvs = &cvd->vdev_stat;
|
|
1447 |
|
|
1448 |
mutex_enter(&vd->vdev_stat_lock);
|
|
1449 |
for (t = 0; t < ZIO_TYPES; t++) {
|
|
1450 |
vs->vs_ops[t] += cvs->vs_ops[t];
|
|
1451 |
vs->vs_bytes[t] += cvs->vs_bytes[t];
|
|
1452 |
}
|
|
1453 |
vs->vs_read_errors += cvs->vs_read_errors;
|
|
1454 |
vs->vs_write_errors += cvs->vs_write_errors;
|
|
1455 |
vs->vs_checksum_errors += cvs->vs_checksum_errors;
|
|
1456 |
vs->vs_scrub_examined += cvs->vs_scrub_examined;
|
|
1457 |
vs->vs_scrub_errors += cvs->vs_scrub_errors;
|
|
1458 |
mutex_exit(&vd->vdev_stat_lock);
|
|
1459 |
}
|
|
1460 |
}
|
|
1461 |
}
|
|
1462 |
|
|
1463 |
void
|
|
1464 |
vdev_stat_update(zio_t *zio)
|
|
1465 |
{
|
|
1466 |
vdev_t *vd = zio->io_vd;
|
|
1467 |
vdev_t *pvd;
|
|
1468 |
uint64_t txg = zio->io_txg;
|
|
1469 |
vdev_stat_t *vs = &vd->vdev_stat;
|
|
1470 |
zio_type_t type = zio->io_type;
|
|
1471 |
int flags = zio->io_flags;
|
|
1472 |
|
|
1473 |
if (zio->io_error == 0) {
|
|
1474 |
if (!(flags & ZIO_FLAG_IO_BYPASS)) {
|
|
1475 |
mutex_enter(&vd->vdev_stat_lock);
|
|
1476 |
vs->vs_ops[type]++;
|
|
1477 |
vs->vs_bytes[type] += zio->io_size;
|
|
1478 |
mutex_exit(&vd->vdev_stat_lock);
|
|
1479 |
}
|
|
1480 |
if ((flags & ZIO_FLAG_IO_REPAIR) &&
|
|
1481 |
zio->io_delegate_list == NULL) {
|
|
1482 |
mutex_enter(&vd->vdev_stat_lock);
|
|
1483 |
if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
|
|
1484 |
vs->vs_scrub_repaired += zio->io_size;
|
|
1485 |
else
|
|
1486 |
vs->vs_self_healed += zio->io_size;
|
|
1487 |
mutex_exit(&vd->vdev_stat_lock);
|
|
1488 |
}
|
|
1489 |
return;
|
|
1490 |
}
|
|
1491 |
|
|
1492 |
if (flags & ZIO_FLAG_SPECULATIVE)
|
|
1493 |
return;
|
|
1494 |
|
|
1495 |
if (!vdev_is_dead(vd)) {
|
|
1496 |
mutex_enter(&vd->vdev_stat_lock);
|
|
1497 |
if (type == ZIO_TYPE_READ) {
|
|
1498 |
if (zio->io_error == ECKSUM)
|
|
1499 |
vs->vs_checksum_errors++;
|
|
1500 |
else
|
|
1501 |
vs->vs_read_errors++;
|
|
1502 |
}
|
|
1503 |
if (type == ZIO_TYPE_WRITE)
|
|
1504 |
vs->vs_write_errors++;
|
|
1505 |
mutex_exit(&vd->vdev_stat_lock);
|
|
1506 |
}
|
|
1507 |
|
|
1508 |
if (type == ZIO_TYPE_WRITE) {
|
|
1509 |
if (txg == 0 || vd->vdev_children != 0)
|
|
1510 |
return;
|
|
1511 |
if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
|
|
1512 |
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
|
|
1513 |
for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
|
|
1514 |
vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
|
|
1515 |
}
|
|
1516 |
if (!(flags & ZIO_FLAG_IO_REPAIR)) {
|
|
1517 |
vdev_t *tvd = vd->vdev_top;
|
|
1518 |
if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
|
|
1519 |
return;
|
|
1520 |
vdev_dirty(tvd, VDD_DTL, txg);
|
|
1521 |
(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
|
|
1522 |
for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
|
|
1523 |
vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
|
|
1524 |
}
|
|
1525 |
}
|
|
1526 |
}
|
|
1527 |
|
|
1528 |
void
|
|
1529 |
vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
|
|
1530 |
{
|
|
1531 |
int c;
|
|
1532 |
vdev_stat_t *vs = &vd->vdev_stat;
|
|
1533 |
|
|
1534 |
for (c = 0; c < vd->vdev_children; c++)
|
|
1535 |
vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
|
|
1536 |
|
|
1537 |
mutex_enter(&vd->vdev_stat_lock);
|
|
1538 |
|
|
1539 |
if (type == POOL_SCRUB_NONE) {
|
|
1540 |
/*
|
|
1541 |
* Update completion and end time. Leave everything else alone
|
|
1542 |
* so we can report what happened during the previous scrub.
|
|
1543 |
*/
|
|
1544 |
vs->vs_scrub_complete = complete;
|
|
1545 |
vs->vs_scrub_end = gethrestime_sec();
|
|
1546 |
} else {
|
|
1547 |
vs->vs_scrub_type = type;
|
|
1548 |
vs->vs_scrub_complete = 0;
|
|
1549 |
vs->vs_scrub_examined = 0;
|
|
1550 |
vs->vs_scrub_repaired = 0;
|
|
1551 |
vs->vs_scrub_errors = 0;
|
|
1552 |
vs->vs_scrub_start = gethrestime_sec();
|
|
1553 |
vs->vs_scrub_end = 0;
|
|
1554 |
}
|
|
1555 |
|
|
1556 |
mutex_exit(&vd->vdev_stat_lock);
|
|
1557 |
}
|
|
1558 |
|
|
1559 |
/*
|
|
1560 |
* Report checksum errors that a vdev that didn't realize it made.
|
|
1561 |
* This can happen, for example, when RAID-Z combinatorial reconstruction
|
|
1562 |
* infers that one of its components returned bad data.
|
|
1563 |
*/
|
|
1564 |
void
|
|
1565 |
vdev_checksum_error(zio_t *zio, vdev_t *vd)
|
|
1566 |
{
|
|
1567 |
dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
|
|
1568 |
vdev_description(vd));
|
|
1569 |
|
|
1570 |
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
|
|
1571 |
mutex_enter(&vd->vdev_stat_lock);
|
|
1572 |
vd->vdev_stat.vs_checksum_errors++;
|
|
1573 |
mutex_exit(&vd->vdev_stat_lock);
|
|
1574 |
}
|
|
1575 |
}
|
|
1576 |
|
|
1577 |
/*
|
|
1578 |
* Update the in-core space usage stats for this vdev and the root vdev.
|
|
1579 |
*/
|
|
1580 |
void
|
|
1581 |
vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
|
|
1582 |
{
|
|
1583 |
ASSERT(vd == vd->vdev_top);
|
|
1584 |
|
|
1585 |
do {
|
|
1586 |
mutex_enter(&vd->vdev_stat_lock);
|
|
1587 |
vd->vdev_stat.vs_space += space_delta;
|
|
1588 |
vd->vdev_stat.vs_alloc += alloc_delta;
|
|
1589 |
mutex_exit(&vd->vdev_stat_lock);
|
|
1590 |
} while ((vd = vd->vdev_parent) != NULL);
|
|
1591 |
}
|
|
1592 |
|
|
1593 |
/*
|
|
1594 |
* Various knobs to tune a vdev.
|
|
1595 |
*/
|
|
1596 |
static vdev_knob_t vdev_knob[] = {
|
|
1597 |
{
|
|
1598 |
"cache_size",
|
|
1599 |
"size of the read-ahead cache",
|
|
1600 |
0,
|
|
1601 |
1ULL << 30,
|
|
1602 |
10ULL << 20,
|
|
1603 |
offsetof(struct vdev, vdev_cache.vc_size)
|
|
1604 |
},
|
|
1605 |
{
|
|
1606 |
"cache_bshift",
|
|
1607 |
"log2 of cache blocksize",
|
|
1608 |
SPA_MINBLOCKSHIFT,
|
|
1609 |
SPA_MAXBLOCKSHIFT,
|
|
1610 |
16,
|
|
1611 |
offsetof(struct vdev, vdev_cache.vc_bshift)
|
|
1612 |
},
|
|
1613 |
{
|
|
1614 |
"cache_max",
|
|
1615 |
"largest block size to cache",
|
|
1616 |
0,
|
|
1617 |
SPA_MAXBLOCKSIZE,
|
|
1618 |
1ULL << 14,
|
|
1619 |
offsetof(struct vdev, vdev_cache.vc_max)
|
|
1620 |
},
|
|
1621 |
{
|
|
1622 |
"min_pending",
|
|
1623 |
"minimum pending I/Os to the disk",
|
|
1624 |
1,
|
|
1625 |
10000,
|
|
1626 |
2,
|
|
1627 |
offsetof(struct vdev, vdev_queue.vq_min_pending)
|
|
1628 |
},
|
|
1629 |
{
|
|
1630 |
"max_pending",
|
|
1631 |
"maximum pending I/Os to the disk",
|
|
1632 |
1,
|
|
1633 |
10000,
|
|
1634 |
35,
|
|
1635 |
offsetof(struct vdev, vdev_queue.vq_max_pending)
|
|
1636 |
},
|
|
1637 |
{
|
|
1638 |
"agg_limit",
|
|
1639 |
"maximum size of aggregated I/Os",
|
|
1640 |
0,
|
|
1641 |
SPA_MAXBLOCKSIZE,
|
|
1642 |
SPA_MAXBLOCKSIZE,
|
|
1643 |
offsetof(struct vdev, vdev_queue.vq_agg_limit)
|
|
1644 |
},
|
|
1645 |
{
|
|
1646 |
"time_shift",
|
|
1647 |
"deadline = pri + (lbolt >> time_shift)",
|
|
1648 |
0,
|
|
1649 |
63,
|
|
1650 |
4,
|
|
1651 |
offsetof(struct vdev, vdev_queue.vq_time_shift)
|
|
1652 |
},
|
|
1653 |
{
|
|
1654 |
"ramp_rate",
|
|
1655 |
"exponential I/O issue ramp-up rate",
|
|
1656 |
1,
|
|
1657 |
10000,
|
|
1658 |
2,
|
|
1659 |
offsetof(struct vdev, vdev_queue.vq_ramp_rate)
|
|
1660 |
},
|
|
1661 |
};
|
|
1662 |
|
|
1663 |
/*
 * Iterate over the vdev_knob table.
 *
 * Pass NULL to get the first knob; pass a previously returned knob to
 * get the one after it.  Returns NULL once the end of the table has been
 * reached.
 */
vdev_knob_t *
vdev_knob_next(vdev_knob_t *vk)
{
	vdev_knob_t *end;
	vdev_knob_t *next;

	if (vk == NULL)
		return (vdev_knob);

	/* One past the last entry of the table. */
	end = vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t);
	next = vk + 1;

	return (next == end ? NULL : next);
}
|
|
1674 |
|
|
1675 |
/*
|
|
1676 |
* Mark a top-level vdev's config as dirty, placing it on the dirty list
|
|
1677 |
* so that it will be written out next time the vdev configuration is synced.
|
|
1678 |
* If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
|
|
1679 |
*/
|
|
1680 |
void
|
|
1681 |
vdev_config_dirty(vdev_t *vd)
|
|
1682 |
{
|
|
1683 |
spa_t *spa = vd->vdev_spa;
|
|
1684 |
vdev_t *rvd = spa->spa_root_vdev;
|
|
1685 |
int c;
|
|
1686 |
|
|
1687 |
if (vd == rvd) {
|
|
1688 |
for (c = 0; c < rvd->vdev_children; c++)
|
|
1689 |
vdev_config_dirty(rvd->vdev_child[c]);
|
|
1690 |
} else {
|
|
1691 |
ASSERT(vd == vd->vdev_top);
|
|
1692 |
|
|
1693 |
if (!vd->vdev_is_dirty) {
|
|
1694 |
list_insert_head(&spa->spa_dirty_list, vd);
|
|
1695 |
vd->vdev_is_dirty = B_TRUE;
|
|
1696 |
}
|
|
1697 |
}
|
|
1698 |
}
|
|
1699 |
|
|
1700 |
void
|
|
1701 |
vdev_config_clean(vdev_t *vd)
|
|
1702 |
{
|
|
1703 |
ASSERT(vd->vdev_is_dirty);
|
|
1704 |
|
|
1705 |
list_remove(&vd->vdev_spa->spa_dirty_list, vd);
|
|
1706 |
vd->vdev_is_dirty = B_FALSE;
|
|
1707 |
}
|
|
1708 |
|
|
1709 |
/*
|
|
1710 |
* Set a vdev's state, updating any parent's state as well.
|
|
1711 |
*/
|
|
1712 |
void
|
|
1713 |
vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
|
|
1714 |
{
|
|
1715 |
if (state == vd->vdev_state)
|
|
1716 |
return;
|
|
1717 |
|
|
1718 |
vd->vdev_state = state;
|
|
1719 |
vd->vdev_stat.vs_aux = aux;
|
|
1720 |
|
|
1721 |
if (vd->vdev_parent != NULL) {
|
|
1722 |
int c;
|
|
1723 |
int degraded = 0, faulted = 0;
|
|
1724 |
vdev_t *parent, *child;
|
|
1725 |
|
|
1726 |
parent = vd->vdev_parent;
|
|
1727 |
for (c = 0; c < parent->vdev_children; c++) {
|
|
1728 |
child = parent->vdev_child[c];
|
|
1729 |
if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
|
|
1730 |
faulted++;
|
|
1731 |
else if (child->vdev_state == VDEV_STATE_DEGRADED)
|
|
1732 |
degraded++;
|
|
1733 |
}
|
|
1734 |
|
|
1735 |
vd->vdev_parent->vdev_ops->vdev_op_state_change(
|
|
1736 |
vd->vdev_parent, faulted, degraded);
|
|
1737 |
}
|
|
1738 |
}
|