/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

static uint32_t spa_active_count;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry",
	    4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);

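	/*
	 * One issue and one interrupt taskq per zio type: the zio pipeline
	 * dispatches I/O through the issue taskqs and hands completion
	 * processing back through the intr taskqs.
	 */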
	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	taskq_destroy(spa->spa_vdev_retry_taskq);
	spa->spa_vdev_retry_taskq = NULL;

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	vdev_t *vd;

	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(vd);
		return (NULL);
	}

	for (c = 0; c < children; c++) {
		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
			vdev_free(vd);
			return (NULL);
		}
	}

	return (vd);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER);
	spa_config_exit(spa);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev) {
		vdev_free(spa->spa_root_vdev);
		spa->spa_root_vdev = NULL;
	}
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information. The 'readonly' flag will prevent us
 * from writing any updated state to disk, and can be used when testing a pool
 * for import.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t pool_guid;
	zio_t *zio;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (EINVAL);

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if (import && spa_guid_exists(pool_guid, 0))
		return (EEXIST);

	/*
	 * Parse the configuration into a vdev tree.
	 */
	spa_config_enter(spa, RW_WRITER);
	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa);

	if (rvd == NULL)
		return (EINVAL);

	spa->spa_root_vdev = rvd;
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0)
		return (ENXIO);

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		dprintf("ub_txg is zero\n");
		return (ENXIO);
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		rvd->vdev_state = VDEV_STATE_CANT_OPEN;
		rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM;
		dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n",
		    rvd->vdev_guid_sum, ub->ub_guid_sum);
		return (ENXIO);
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg);
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	VERIFY(zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) == 0);

	if (!mosconfig) {
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		nvlist_t *newconfig = NULL;

		db = dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object);
		dmu_buf_read(db);
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read_canfail(spa->spa_meta_objset,
		    spa->spa_config_object, 0, nvsize, packed);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
		kmem_free(packed, nvsize);

		if (error)
			return (ENXIO);

		spa_config_set(spa, newconfig);

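		/*
		 * Now that we have the authoritative config from the MOS,
		 * throw away the state we built from the cached config and
		 * reload using the real thing.
		 */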
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, readonly, import, B_TRUE));
	}

	VERIFY(zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0);

	/*
	 * Load the vdev state for all top level vdevs.
	 */
	if ((error = vdev_load(rvd, import)) != 0)
		return (error);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa);

	/*
	 * Check the state of the root vdev. If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
		return (ENXIO);

	/*
	 * Claim log blocks that haven't been committed yet, and update all
	 * top-level vdevs to sync any config changes found in vdev_load().
	 * This must all happen in a single txg.
	 */
	if ((spa_mode & FWRITE) && !readonly) {
		dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		vdev_config_dirty(rvd);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}

	return (0);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache. For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again. The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config,
		    B_FALSE, B_FALSE, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_load() returns EBADF, it indicates that one
			 * of the vdevs indicates that the pool has been
			 * exported or destroyed. If this is the case, the
			 * config cache is out of sync and we should remove the
			 * pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open(). Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL)
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
			spa_unload(spa);
			spa_deactivate(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

int
spa_get_stats(const char *name, nvlist_t **config)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
{
	spa_t *spa;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error;
	uint64_t txg = TXG_INITIAL;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}
	spa = spa_add(pool);

	/*
	 * Allocate a new spa_t structure.
	 */
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_ubsync = spa->spa_uberblock;

	error = spa_vdev_add(spa, nvroot);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (altroot != NULL) {
		spa->spa_root = spa_strdup(altroot);
		atomic_add_32(&spa_active_count, 1);
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	VERIFY(zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0);

	/*
	 * Create the deferred-free bplist object. Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	VERIFY(zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0);

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system. We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, char *altroot)
{
	spa_t *spa;
	int error;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig
	 * so that we don't try to open the pool if the config is damaged.
	 */
	error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Set the alternate root, if there is one.
	 */
	if (altroot != NULL) {
		atomic_add_32(&spa_active_count, 1);
		spa->spa_root = spa_strdup(altroot);
	}

	/*
	 * Initialize the config based on the in-core state.
	 */
	config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0);

	spa_config_set(spa, config);

	/*
	 * Sync the configuration cache.
	 */
	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME);

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	/*
	 * Initialize the spa_t structure.
	 */
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig
	 * so we don't try to open the pool if the config is damaged.
	 */
	(void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple. We make sure there
 * is no more pending I/O and any references to the pool are gone. Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state)
{
	spa_t *spa;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		if (!spa_refcount_zero(spa)) {
			spa_scrub_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		/*
		 * Update the pool state.
		 */
		spa->spa_state = new_state;

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		if (spa->spa_root != NULL)
			atomic_add_32(&spa_active_count, -1);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty. spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		vdev_config_dirty(spa->spa_root_vdev);
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	spa_remove(spa);
	spa_config_sync();
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;

	txg = spa_vdev_enter(spa);

	vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (vd == NULL)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (rvd == NULL)	/* spa_create() */
		spa->spa_root_vdev = rvd = vd;

	if ((error = vdev_create(vd, txg)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each top-level vdev from the temporary root
	 * to the spa's root and initialize its metaslabs.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *tvd = vd->vdev_child[c];
		if (vd != rvd) {
			vdev_remove_child(vd, tvd);
			tvd->vdev_id = rvd->vdev_children;
			vdev_add_child(rvd, tvd);
		}
		vdev_init(tvd, txg);
		vdev_config_dirty(tvd);
	}

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * Attach a device to a mirror. The arguments are the path to any device
 * in the mirror, and the nvroot for the new device. If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_path(rvd, path);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	pvd = oldvd->vdev_parent;

	/*
	 * The parent must be a mirror or the root, unless we're replacing;
	 * in that case, the parent can be anything but another replacing vdev.
	 */
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_root_ops &&
	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (newrootvd == NULL || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing,
	 * insert the new mirror/replacing vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
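	/*
	 * open_txg is the last txg that can commit before the attach is
	 * visible, so writes in any txg up to open_txg may have bypassed
	 * newvd; the DTL below covers that entire window.
	 */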
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, txg);
	(void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);

	dprintf("attached %s, replacing=%d\n", path, replacing);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_path(rvd, path);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (guid != 0 && vd->vdev_guid != guid)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.
	 */
	if (replace_done &&
	    (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Only mirror and replacing vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}
	if (c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0);
	if (error)
		dprintf("unable to erase labels on %s\n", vdev_description(vd));

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed. Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reopen this top-level vdev to reassess health after detach.
	 */
	vdev_reopen(tvd, NULL);

	/*
	 * If the device we just detached was smaller than the others,
	 * it may be possible to add metaslabs (i.e. grow the pool).
	 */
	vdev_metaslab_init(tvd, txg);

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.
	 * vdev_dtl_sync() will see that vd->vdev_detached is set
	 * and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list,
	 * to prevent vd from being accessed after it's freed.
	 */
	vdev_dirty(tvd, VDD_DTL, txg);
	vd->vdev_detached = B_TRUE;
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);

	dprintf("detached %s\n", path);

	return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * If there are any replacing vdevs that have finished replacing, detach them.
 * We can't hold the config lock across detaches, so we lock the config,
 * build a list of candidates, unlock the config, and try each candidate.
 */
typedef struct vdev_detach_link {
	char		*vdl_path;
	uint64_t	vdl_guid;
	list_node_t	vdl_node;
} vdev_detach_link_t;

static void
spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_vdev_replace_done_make_list(l, vd->vdev_child[c]);

	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		vdev_t *cvd0 = vd->vdev_child[0];
		vdev_t *cvd1 = vd->vdev_child[1];
		vdev_detach_link_t *vdl;
		int dirty1;

		mutex_enter(&cvd1->vdev_dtl_lock);
		dirty1 = cvd1->vdev_dtl_map.sm_space |
		    cvd1->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd1->vdev_dtl_lock);

		if (!dirty1) {
			vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP);
			vdl->vdl_path = spa_strdup(cvd0->vdev_path);
			vdl->vdl_guid = cvd0->vdev_guid;
			list_insert_tail(l, vdl);
		}
	}
}

void
spa_vdev_replace_done(spa_t *spa)
{
	vdev_detach_link_t *vdl;
	list_t vdlist;

	list_create(&vdlist, sizeof (vdev_detach_link_t),
	    offsetof(vdev_detach_link_t, vdl_node));

	spa_config_enter(spa, RW_READER);
	spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev);
	spa_config_exit(spa);

	while ((vdl = list_head(&vdlist)) != NULL) {
		list_remove(&vdlist, vdl);
		(void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid,
		    B_TRUE);
		spa_strfree(vdl->vdl_path);
		kmem_free(vdl, sizeof (*vdl));
	}

	list_destroy(&vdlist);
}

/*
 * Update the stored path for this vdev. Dirty the vdev configuration, relying
 * on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENOENT));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t);

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error)
		spa->spa_scrub_errors++;
	if (--spa->spa_scrub_inflight == 0)
		cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);

	if (zio->io_error) {
		vdev_t *vd = zio->io_vd;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
}

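/*
 * Issue an asynchronous read of the block; spa_scrub_io_done() frees the
 * buffer, records any error, and wakes waiters once the last in-flight
 * scrub I/O completes.
 */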
static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data = zio_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));

	if (bc->bc_errno || vd == NULL) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool. Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		if (vd != NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			vd->vdev_stat.vs_scrub_errors++;
			mutex_exit(&vd->vdev_stat_lock);
		}

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	/*
	 * Keep track of how much data we've examined so that
	 * zpool(1M) status can make useful progress reports.
	 */
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
	mutex_exit(&vd->vdev_stat_lock);

	if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
		if (DVA_GET_GANG(&bp->blk_dva[0])) {
			/*
			 * Gang members may be spread across multiple vdevs,
			 * so the best we can do is look at the pool-wide DTL.
			 * XXX -- it would be better to change our allocation
			 * policy to ensure that this can't happen.
			 */
			vd = spa->spa_root_vdev;
		}
		if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
			spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
			    ZIO_FLAG_RESILVER);
		}
	} else {
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB);
	}

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	spa_config_enter(spa, RW_WRITER);
	vdev_reopen(rvd, NULL);		/* purge all vdev caches */
	vdev_config_dirty(rvd);		/* rewrite all disk labels */
	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
	spa_config_exit(spa);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_errors = 0;
	spa->spa_scrub_active = 1;

	while (!spa->spa_scrub_stop) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (spa->spa_scrub_suspend) {
			spa->spa_scrub_active = 0;
			cv_broadcast(&spa->spa_scrub_cv);
			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_active = 1;
		}
		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

		if (spa->spa_scrub_restart_txg != 0)
			break;

		mutex_exit(&spa->spa_scrub_lock);
		error = traverse_more(th);
		mutex_enter(&spa->spa_scrub_lock);
		if (error != EAGAIN)
			break;
	}

	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

	if (spa->spa_scrub_restart_txg != 0)
		error = ERESTART;

	spa->spa_scrub_active = 0;
	cv_broadcast(&spa->spa_scrub_cv);

	/*
	 * If the traverse completed, and there were no errors,
	 * then the scrub was completely successful.
	 */
	complete = (error == 0 && spa->spa_scrub_errors == 0);

	dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);

	mutex_exit(&spa->spa_scrub_lock);

	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	spa_config_enter(spa, RW_WRITER);
	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
	spa_config_exit(spa);

	spa_vdev_replace_done(spa);

	spa_config_enter(spa, RW_READER);
	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
	spa_config_exit(spa);

	mutex_enter(&spa->spa_scrub_lock);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;

	cv_broadcast(&spa->spa_scrub_cv);

	/*
	 * If we were told to restart, our final act is to start a new scrub.
	 */
	if (error == ERESTART)
		VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0);

	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspend++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspend != 0);
	if (--spa->spa_scrub_suspend == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs. The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

static int
spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;
	int advance = 0;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force)
			return (EBUSY);

		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	switch (type) {

	case POOL_SCRUB_NONE:
		break;

	case POOL_SCRUB_RESILVER:
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
		 */
		mutex_enter(&rvd->vdev_dtl_lock);
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss ? ss->ss_start - 1 : 0;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = ss ? ss->ss_end : 0;
		maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1);
		mutex_exit(&rvd->vdev_dtl_lock);

		advance = ADVANCE_PRE | ADVANCE_PRUNE;
		break;

	case POOL_SCRUB_EVERYTHING:
		/*
		 * A scrub is like a resilver, but not pruned by DTL.
		 */
		advance = ADVANCE_PRE;
		break;
	}

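	/*
	 * Only kick off a new scrub thread if there is a nonempty txg
	 * range to traverse; for POOL_SCRUB_NONE the work above (stopping
	 * any scrub in progress) is all that was wanted.
	 */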
1547 |
if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) { |
|
1548 |
spa->spa_scrub_maxtxg = maxtxg; |
|
1549 |
spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, |
|
1550 |
advance, ZIO_FLAG_CANFAIL); |
|
1551 |
traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); |
|
1552 |
spa->spa_scrub_thread = thread_create(NULL, 0, |
|
1553 |
spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); |
|
1554 |
} |
|
1555 |
||
1556 |
return (0); |
|
1557 |
} |
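/*
 * Public entry point: takes spa_scrub_lock around spa_scrub_locked().
 * If a scrub was requested but no scrub thread ended up being started
 * (th == NULL), there is nothing left to resilver, so any pending vdev
 * replacement can be completed right away via spa_vdev_replace_done().
 */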
int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	int error;
	traverse_handle_t *th;

	mutex_enter(&spa->spa_scrub_lock);
	error = spa_scrub_locked(spa, type, force);
	th = spa->spa_scrub_th;
	mutex_exit(&spa->spa_scrub_lock);

	if (th == NULL && type != POOL_SCRUB_NONE)
		spa_vdev_replace_done(spa);

	return (error);
}
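/*
 * Illustrative usage (a sketch, not part of this file): a caller such
 * as the zfs ioctl path could start a full scrub, or forcibly stop an
 * in-progress scrub or resilver, with:
 *
 *	error = spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_FALSE);
 *	...
 *	error = spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE);
 *
 * force = B_TRUE is required to interrupt an active resilver, per the
 * EBUSY check in spa_scrub_locked().
 */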

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

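/*
 * Process the deferred-free list for this txg: issue a free for every
 * block recorded in spa_sync_bplist, then empty the list.  Frees are
 * presumably deferred this way so that blocks released in one txg are
 * not reused before that txg is safely on disk.
 */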
static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}
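/*
 * If the vdev configuration changed this txg (spa_dirty_list is
 * non-empty), regenerate the config, pack it as an XDR-encoded nvlist,
 * and write it into the MOS config object.  The packed size is recorded
 * in the object's bonus buffer so a reader knows how many bytes to
 * unpack.
 */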
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	spa_config_set(spa, config);

	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0);

	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
	    packed, tx);

	kmem_free(packed, nvsize);

	db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object);
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db);
}
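/*
 * A sketch of the read side (assumed, not shown in this file): a loader
 * would hold the bonus buffer to learn the packed size, read that many
 * bytes from the config object, and unpack them:
 *
 *	db = dmu_bonus_hold(mos, spa->spa_config_object);
 *	nvsize = *(uint64_t *)db->db_data;
 *	dmu_buf_rele(db);
 *	packed = kmem_alloc(nvsize, KM_SLEEP);
 *	dmu_read(mos, spa->spa_config_object, 0, nvsize, packed);
 *	VERIFY(nvlist_unpack(packed, nvsize, &config, 0) == 0);
 *	kmem_free(packed, nvsize);
 */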

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	bplist_open(bpl, mos, spa->spa_sync_bplist_obj);

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		tx = dmu_tx_create_assigned(dp, txg);
		spa_sync_config_object(spa, tx);
		dmu_tx_commit(tx);

		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		tx = dmu_tx_create_assigned(dp, txg);
		bplist_sync(bpl, tx);
		dmu_tx_commit(tx);

	} while (dirty_vdevs);
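	/*
	 * Each pass above can dirty new metadata: syncing datasets
	 * allocates blocks, which dirties vdev space maps, which in turn
	 * can dirty more metadata.  The loop therefore repeats until a
	 * pass completes with no dirty vdevs left for this txg.
	 */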

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 */
	while (spa_sync_labels(spa, txg)) {
		dprintf("waiting for devices to heal\n");
		delay(hz);
		vdev_reopen(rvd, NULL);
	}
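	/*
	 * Note that spa_sync_labels() is retried indefinitely: the txg
	 * cannot be declared committed until the labels and uberblock
	 * are safely rewritten, so we wait a second, reopen the vdevs
	 * in the hope that failed devices have healed, and try again.
	 */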

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */
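	/*
	 * The suspend/resume bracket above matters: pool traversals
	 * (including the scrub) use spa_ubsync as their root, so scrub
	 * I/O must be quiesced before the copy is swapped under the
	 * traverse lock.
	 */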

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since spa_sync_labels().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa);
}
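/*
 * spa_sync() is presumably driven once per open txg by the txg sync
 * machinery (not shown here); callers that need a txg on disk wait for
 * it, as spa_sync_allpools() below does with txg_wait_synced().
 */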

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during
 * the sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}
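/*
 * The hold/drop pattern above -- spa_open_ref(), drop the namespace
 * lock, wait, retake the lock, spa_close() -- keeps the spa_t alive
 * across the wait without holding spa_namespace_lock while a sync is
 * in progress; spa_evict_all() below uses the same idiom for the same
 * reason.
 */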

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

int
spa_busy(void)
{
	return (spa_active_count != 0);
}
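/*
 * spa_busy() reports whether any pool is currently held active; a
 * plausible (assumed) consumer is the module-unload path, which would
 * refuse to unload while pools are in use.
 */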

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop all scrub and resilver activity.  spa_scrub() needs to
		 * wait for the scrub thread, which may do a detach and sync
		 * the configs, which needs spa_namespace_lock.  Drop the lock
		 * while maintaining a hold on the spa_t.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}
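/*
 * Note the shutdown order for each pool: stop scrub/resilver activity
 * first (spa_scrub() with POOL_SCRUB_NONE and force = B_TRUE), then
 * unload and deactivate any initialized pool, then remove it from the
 * namespace.  This is presumably the module-teardown path, so by the
 * time it runs every spa should already be unreferenced.
 */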