author | bonwick |
Sun, 02 Apr 2006 00:47:06 -0800 | |
changeset 1732 | 9e3ae798af31 |
parent 1544 | 938876158511 |
child 1775 | e51e26b432c0 |
permissions | -rw-r--r-- |
789 | 1 |
/* |
2 |
* CDDL HEADER START |
|
3 |
* |
|
4 |
* The contents of this file are subject to the terms of the |
|
1544 | 5 |
* Common Development and Distribution License (the "License"). |
6 |
* You may not use this file except in compliance with the License. |
|
789 | 7 |
* |
8 |
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
|
9 |
* or http://www.opensolaris.org/os/licensing. |
|
10 |
* See the License for the specific language governing permissions |
|
11 |
* and limitations under the License. |
|
12 |
* |
|
13 |
* When distributing Covered Code, include this CDDL HEADER in each |
|
14 |
* file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
|
15 |
* If applicable, add the following below this CDDL HEADER, with the |
|
16 |
* fields enclosed by brackets "[]" replaced with your own identifying |
|
17 |
* information: Portions Copyright [yyyy] [name of copyright owner] |
|
18 |
* |
|
19 |
* CDDL HEADER END |
|
20 |
*/ |
|
21 |
/* |
|
1544 | 22 |
* Copyright 2006 Sun Microsystems, Inc. All rights reserved. |
789 | 23 |
* Use is subject to license terms. |
24 |
*/ |
|
25 |
||
26 |
#pragma ident "%Z%%M% %I% %E% SMI" |
|
27 |
||
28 |
#include <sys/zfs_context.h> |
|
29 |
#include <sys/spa.h> |
|
30 |
#include <sys/vdev_impl.h> |
|
31 |
#include <sys/zio.h> |
|
32 |
#include <sys/zio_checksum.h> |
|
33 |
#include <sys/fs/zfs.h> |
|
1544 | 34 |
#include <sys/fm/fs/zfs.h> |
789 | 35 |
|
36 |
/* |
|
37 |
* Virtual device vector for RAID-Z. |
|
38 |
*/ |
|
39 |
||
40 |
/* |
|
41 |
* We currently allow up to two-way replication (i.e. single-fault |
|
42 |
* reconstruction) models in RAID-Z vdevs. The blocks in such vdevs |
|
43 |
* must all be multiples of two times the leaf vdev blocksize. |
|
44 |
*/ |
|
45 |
#define	VDEV_RAIDZ_ALIGN	(2ULL)	/* in units of the leaf vdev block size */
|
46 |
||
47 |
/*
 * One column of a RAID-Z I/O: the piece of the parent zio that is
 * read from, or written to, a single child vdev.
 */
typedef struct raidz_col {
	uint64_t	rc_col;		/* index of the child vdev used */
	uint64_t	rc_offset;	/* offset of this column's I/O on the child */
	uint64_t	rc_size;	/* size of this column's I/O, in bytes */
	void		*rc_data;	/* buffer holding this column's data */
	int		rc_error;	/* errno recorded for this column, 0 if none */
	short		rc_tried;	/* nonzero: I/O completed, or child known dead */
	short		rc_skipped;	/* nonzero: column was deliberately not read */
} raidz_col_t;
|
56 |
||
57 |
typedef struct raidz_map { |
|
58 |
uint64_t rm_cols; |
|
59 |
uint64_t rm_bigcols; |
|
60 |
uint64_t rm_asize; |
|
61 |
int rm_missing_child; |
|
62 |
int rm_firstdatacol; |
|
63 |
raidz_col_t rm_col[1]; |
|
64 |
} raidz_map_t; |
|
65 |
||
66 |
static raidz_map_t * |
|
1133
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
67 |
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols) |
789 | 68 |
{ |
69 |
raidz_map_t *rm; |
|
70 |
uint64_t b = zio->io_offset >> unit_shift; |
|
71 |
uint64_t s = zio->io_size >> unit_shift; |
|
72 |
uint64_t f = b % dcols; |
|
73 |
uint64_t o = (b / dcols) << unit_shift; |
|
74 |
uint64_t q, r, c, bc, col, acols, coff; |
|
75 |
int firstdatacol; |
|
76 |
||
1133
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
77 |
q = s / (dcols - 1); |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
78 |
r = s - q * (dcols - 1); |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
79 |
bc = r + !!r; |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
80 |
firstdatacol = 1; |
789 | 81 |
|
82 |
acols = (q == 0 ? bc : dcols); |
|
83 |
||
84 |
rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); |
|
85 |
||
86 |
rm->rm_cols = acols; |
|
87 |
rm->rm_bigcols = bc; |
|
88 |
rm->rm_asize = 0; |
|
89 |
rm->rm_missing_child = -1; |
|
90 |
rm->rm_firstdatacol = firstdatacol; |
|
91 |
||
92 |
for (c = 0; c < acols; c++) { |
|
93 |
col = f + c; |
|
94 |
coff = o; |
|
95 |
if (col >= dcols) { |
|
96 |
col -= dcols; |
|
97 |
coff += 1ULL << unit_shift; |
|
98 |
} |
|
99 |
rm->rm_col[c].rc_col = col; |
|
100 |
rm->rm_col[c].rc_offset = coff; |
|
101 |
rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; |
|
102 |
rm->rm_col[c].rc_data = NULL; |
|
103 |
rm->rm_col[c].rc_error = 0; |
|
104 |
rm->rm_col[c].rc_tried = 0; |
|
105 |
rm->rm_col[c].rc_skipped = 0; |
|
106 |
rm->rm_asize += rm->rm_col[c].rc_size; |
|
107 |
} |
|
108 |
||
109 |
rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift); |
|
110 |
||
111 |
for (c = 0; c < rm->rm_firstdatacol; c++) |
|
112 |
rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); |
|
113 |
||
114 |
rm->rm_col[c].rc_data = zio->io_data; |
|
115 |
||
116 |
for (c = c + 1; c < acols; c++) |
|
117 |
rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + |
|
118 |
rm->rm_col[c - 1].rc_size; |
|
119 |
||
1133
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
120 |
/* |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
121 |
* To prevent hot parity disks, switch the parity and data |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
122 |
* columns every 1MB. |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
123 |
*/ |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
124 |
ASSERT(rm->rm_cols >= 2); |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
125 |
ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); |
789 | 126 |
|
1133
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
127 |
if (zio->io_offset & (1ULL << 20)) { |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
128 |
col = rm->rm_col[0].rc_col; |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
129 |
o = rm->rm_col[0].rc_offset; |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
130 |
rm->rm_col[0].rc_col = rm->rm_col[1].rc_col; |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
131 |
rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
132 |
rm->rm_col[1].rc_col = col; |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
133 |
rm->rm_col[1].rc_offset = o; |
789 | 134 |
} |
135 |
||
136 |
zio->io_vsd = rm; |
|
137 |
return (rm); |
|
138 |
} |
|
139 |
||
140 |
static void |
|
141 |
vdev_raidz_map_free(zio_t *zio) |
|
142 |
{ |
|
143 |
raidz_map_t *rm = zio->io_vsd; |
|
144 |
int c; |
|
145 |
||
146 |
for (c = 0; c < rm->rm_firstdatacol; c++) |
|
147 |
zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); |
|
148 |
||
149 |
kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); |
|
150 |
zio->io_vsd = NULL; |
|
151 |
} |
|
152 |
||
153 |
static void |
|
154 |
vdev_raidz_reconstruct(raidz_map_t *rm, int x) |
|
155 |
{ |
|
156 |
uint64_t *dst, *src, count, xsize, csize; |
|
157 |
int i, c; |
|
158 |
||
159 |
for (c = 0; c < rm->rm_cols; c++) { |
|
160 |
if (c == x) |
|
161 |
continue; |
|
162 |
src = rm->rm_col[c].rc_data; |
|
163 |
dst = rm->rm_col[x].rc_data; |
|
164 |
csize = rm->rm_col[c].rc_size; |
|
165 |
xsize = rm->rm_col[x].rc_size; |
|
166 |
count = MIN(csize, xsize) / sizeof (uint64_t); |
|
167 |
if (c == !x) { |
|
168 |
/* |
|
169 |
* The initial copy happens at either c == 0 or c == 1. |
|
170 |
* Both of these columns are 'big' columns, so we'll |
|
171 |
* definitely initialize all of column x. |
|
172 |
*/ |
|
173 |
ASSERT3U(xsize, <=, csize); |
|
174 |
for (i = 0; i < count; i++) |
|
175 |
*dst++ = *src++; |
|
176 |
} else { |
|
177 |
for (i = 0; i < count; i++) |
|
178 |
*dst++ ^= *src++; |
|
179 |
} |
|
180 |
} |
|
181 |
} |
|
182 |
||
183 |
static int |
|
184 |
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) |
|
185 |
{ |
|
186 |
vdev_t *cvd; |
|
187 |
int c, error; |
|
188 |
int lasterror = 0; |
|
189 |
int numerrors = 0; |
|
190 |
||
191 |
/* |
|
192 |
* XXX -- minimum children should be raid-type-specific |
|
193 |
*/ |
|
194 |
if (vd->vdev_children < 2) { |
|
195 |
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; |
|
196 |
return (EINVAL); |
|
197 |
} |
|
198 |
||
199 |
for (c = 0; c < vd->vdev_children; c++) { |
|
200 |
cvd = vd->vdev_child[c]; |
|
201 |
||
202 |
if ((error = vdev_open(cvd)) != 0) { |
|
203 |
lasterror = error; |
|
204 |
numerrors++; |
|
205 |
continue; |
|
206 |
} |
|
207 |
||
208 |
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; |
|
1732 | 209 |
*ashift = MAX(*ashift, cvd->vdev_ashift); |
789 | 210 |
} |
211 |
||
212 |
*asize *= vd->vdev_children; |
|
213 |
||
214 |
if (numerrors > 1) { |
|
215 |
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; |
|
216 |
return (lasterror); |
|
217 |
} |
|
218 |
||
219 |
return (0); |
|
220 |
} |
|
221 |
||
222 |
static void |
|
223 |
vdev_raidz_close(vdev_t *vd) |
|
224 |
{ |
|
225 |
int c; |
|
226 |
||
227 |
for (c = 0; c < vd->vdev_children; c++) |
|
228 |
vdev_close(vd->vdev_child[c]); |
|
229 |
} |
|
230 |
||
231 |
static uint64_t |
|
232 |
vdev_raidz_asize(vdev_t *vd, uint64_t psize) |
|
233 |
{ |
|
234 |
uint64_t asize; |
|
1732 | 235 |
uint64_t ashift = vd->vdev_top->vdev_ashift; |
789 | 236 |
uint64_t cols = vd->vdev_children; |
237 |
||
1732 | 238 |
asize = ((psize - 1) >> ashift) + 1; |
789 | 239 |
asize += (asize + cols - 2) / (cols - 1); |
1732 | 240 |
asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << ashift; |
789 | 241 |
|
242 |
return (asize); |
|
243 |
} |
|
244 |
||
245 |
static void |
|
246 |
vdev_raidz_child_done(zio_t *zio) |
|
247 |
{ |
|
248 |
raidz_col_t *rc = zio->io_private; |
|
249 |
||
250 |
rc->rc_error = zio->io_error; |
|
251 |
rc->rc_tried = 1; |
|
252 |
rc->rc_skipped = 0; |
|
253 |
} |
|
254 |
||
255 |
static void |
|
256 |
vdev_raidz_repair_done(zio_t *zio) |
|
257 |
{ |
|
1732 | 258 |
ASSERT(zio->io_private == zio->io_parent); |
259 |
vdev_raidz_map_free(zio->io_private); |
|
789 | 260 |
} |
261 |
||
262 |
static void |
|
263 |
vdev_raidz_io_start(zio_t *zio) |
|
264 |
{ |
|
265 |
vdev_t *vd = zio->io_vd; |
|
1732 | 266 |
vdev_t *tvd = vd->vdev_top; |
789 | 267 |
vdev_t *cvd; |
268 |
blkptr_t *bp = zio->io_bp; |
|
269 |
raidz_map_t *rm; |
|
270 |
raidz_col_t *rc; |
|
271 |
int c; |
|
272 |
||
1732 | 273 |
rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children); |
789 | 274 |
|
275 |
if (DVA_GET_GANG(ZIO_GET_DVA(zio))) { |
|
276 |
ASSERT3U(rm->rm_asize, ==, |
|
277 |
vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE)); |
|
278 |
} else { |
|
279 |
ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio))); |
|
280 |
} |
|
281 |
||
282 |
if (zio->io_type == ZIO_TYPE_WRITE) { |
|
283 |
||
284 |
/* |
|
285 |
* Generate RAID parity in virtual column 0. |
|
286 |
*/ |
|
287 |
vdev_raidz_reconstruct(rm, 0); |
|
288 |
||
289 |
for (c = 0; c < rm->rm_cols; c++) { |
|
290 |
rc = &rm->rm_col[c]; |
|
291 |
cvd = vd->vdev_child[rc->rc_col]; |
|
292 |
zio_nowait(zio_vdev_child_io(zio, NULL, cvd, |
|
293 |
rc->rc_offset, rc->rc_data, rc->rc_size, |
|
294 |
zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, |
|
295 |
vdev_raidz_child_done, rc)); |
|
296 |
} |
|
297 |
zio_wait_children_done(zio); |
|
298 |
return; |
|
299 |
} |
|
300 |
||
301 |
ASSERT(zio->io_type == ZIO_TYPE_READ); |
|
302 |
||
303 |
for (c = rm->rm_cols - 1; c >= 0; c--) { |
|
304 |
rc = &rm->rm_col[c]; |
|
305 |
cvd = vd->vdev_child[rc->rc_col]; |
|
306 |
if (vdev_is_dead(cvd)) { |
|
307 |
rm->rm_missing_child = c; |
|
308 |
rc->rc_error = ENXIO; |
|
309 |
rc->rc_tried = 1; /* don't even try */ |
|
310 |
rc->rc_skipped = 1; |
|
311 |
continue; |
|
312 |
} |
|
313 |
if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { |
|
314 |
rm->rm_missing_child = c; |
|
315 |
rc->rc_error = ESTALE; |
|
316 |
rc->rc_skipped = 1; |
|
317 |
continue; |
|
318 |
} |
|
319 |
if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 || |
|
320 |
(zio->io_flags & ZIO_FLAG_SCRUB)) { |
|
321 |
zio_nowait(zio_vdev_child_io(zio, NULL, cvd, |
|
322 |
rc->rc_offset, rc->rc_data, rc->rc_size, |
|
323 |
zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, |
|
324 |
vdev_raidz_child_done, rc)); |
|
325 |
} |
|
326 |
} |
|
327 |
||
328 |
zio_wait_children_done(zio); |
|
329 |
} |
|
330 |
||
1544 | 331 |
/* |
332 |
* Report a checksum error for a child of a RAID-Z device. |
|
333 |
*/ |
|
334 |
static void |
|
335 |
raidz_checksum_error(zio_t *zio, raidz_col_t *rc) |
|
336 |
{ |
|
337 |
vdev_t *vd = zio->io_vd->vdev_child[rc->rc_col]; |
|
338 |
dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", |
|
339 |
vdev_description(vd)); |
|
340 |
||
341 |
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { |
|
342 |
mutex_enter(&vd->vdev_stat_lock); |
|
343 |
vd->vdev_stat.vs_checksum_errors++; |
|
344 |
mutex_exit(&vd->vdev_stat_lock); |
|
345 |
} |
|
346 |
||
347 |
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) |
|
348 |
zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, |
|
349 |
zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); |
|
350 |
} |
|
351 |
||
352 |
||
789 | 353 |
static void |
354 |
vdev_raidz_io_done(zio_t *zio) |
|
355 |
{ |
|
356 |
vdev_t *vd = zio->io_vd; |
|
357 |
vdev_t *cvd; |
|
358 |
raidz_map_t *rm = zio->io_vsd; |
|
359 |
raidz_col_t *rc; |
|
360 |
blkptr_t *bp = zio->io_bp; |
|
361 |
int unexpected_errors = 0; |
|
362 |
int c; |
|
363 |
||
364 |
ASSERT(bp != NULL); /* XXX need to add code to enforce this */ |
|
365 |
||
366 |
zio->io_error = 0; |
|
367 |
zio->io_numerrors = 0; |
|
368 |
||
369 |
for (c = 0; c < rm->rm_cols; c++) { |
|
370 |
rc = &rm->rm_col[c]; |
|
371 |
||
372 |
/* |
|
373 |
* We preserve any EIOs because those may be worth retrying; |
|
374 |
* whereas ECKSUM and ENXIO are more likely to be persistent. |
|
375 |
*/ |
|
376 |
if (rc->rc_error) { |
|
377 |
if (zio->io_error != EIO) |
|
378 |
zio->io_error = rc->rc_error; |
|
379 |
if (!rc->rc_skipped) |
|
380 |
unexpected_errors++; |
|
381 |
zio->io_numerrors++; |
|
382 |
} |
|
383 |
} |
|
384 |
||
385 |
if (zio->io_type == ZIO_TYPE_WRITE) { |
|
386 |
/* |
|
387 |
* If this is not a failfast write, and we were able to |
|
388 |
* write enough columns to reconstruct the data, good enough. |
|
389 |
*/ |
|
390 |
/* XXPOLICY */ |
|
391 |
if (zio->io_numerrors <= rm->rm_firstdatacol && |
|
392 |
!(zio->io_flags & ZIO_FLAG_FAILFAST)) |
|
393 |
zio->io_error = 0; |
|
394 |
||
395 |
vdev_raidz_map_free(zio); |
|
396 |
zio_next_stage(zio); |
|
397 |
return; |
|
398 |
} |
|
399 |
||
400 |
ASSERT(zio->io_type == ZIO_TYPE_READ); |
|
401 |
||
402 |
/* |
|
403 |
* If there were no I/O errors, and the data checksums correctly, |
|
404 |
* the read is complete. |
|
405 |
*/ |
|
406 |
/* XXPOLICY */ |
|
407 |
if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) { |
|
408 |
ASSERT(unexpected_errors == 0); |
|
409 |
ASSERT(zio->io_error == 0); |
|
410 |
||
411 |
/* |
|
412 |
* We know the data's good. If we read the parity, |
|
413 |
* verify that it's good as well. If not, fix it. |
|
414 |
*/ |
|
415 |
for (c = 0; c < rm->rm_firstdatacol; c++) { |
|
416 |
void *orig; |
|
417 |
rc = &rm->rm_col[c]; |
|
418 |
if (!rc->rc_tried) |
|
419 |
continue; |
|
420 |
orig = zio_buf_alloc(rc->rc_size); |
|
421 |
bcopy(rc->rc_data, orig, rc->rc_size); |
|
422 |
vdev_raidz_reconstruct(rm, c); |
|
423 |
if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) { |
|
1544 | 424 |
raidz_checksum_error(zio, rc); |
789 | 425 |
rc->rc_error = ECKSUM; |
426 |
unexpected_errors++; |
|
427 |
} |
|
428 |
zio_buf_free(orig, rc->rc_size); |
|
429 |
} |
|
430 |
goto done; |
|
431 |
} |
|
432 |
||
433 |
/* |
|
434 |
* If there was exactly one I/O error, it's the one we expected, |
|
435 |
* and the reconstructed data checksums, the read is complete. |
|
436 |
* This happens when one child is offline and vdev_fault_assess() |
|
437 |
* knows it, or when one child has stale data and the DTL knows it. |
|
438 |
*/ |
|
439 |
if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) { |
|
440 |
rc = &rm->rm_col[c]; |
|
441 |
ASSERT(unexpected_errors == 0); |
|
442 |
ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE); |
|
443 |
vdev_raidz_reconstruct(rm, c); |
|
444 |
if (zio_checksum_error(zio) == 0) { |
|
445 |
zio->io_error = 0; |
|
446 |
goto done; |
|
447 |
} |
|
448 |
} |
|
449 |
||
450 |
/* |
|
451 |
* This isn't a typical error -- either we got a read error or |
|
452 |
* more than one child claimed a problem. Read every block we |
|
453 |
* haven't already so we can try combinatorial reconstruction. |
|
454 |
*/ |
|
455 |
unexpected_errors = 1; |
|
456 |
rm->rm_missing_child = -1; |
|
457 |
||
458 |
for (c = 0; c < rm->rm_cols; c++) |
|
459 |
if (!rm->rm_col[c].rc_tried) |
|
460 |
break; |
|
461 |
||
462 |
if (c != rm->rm_cols) { |
|
463 |
zio->io_error = 0; |
|
464 |
zio_vdev_io_redone(zio); |
|
465 |
for (c = 0; c < rm->rm_cols; c++) { |
|
466 |
rc = &rm->rm_col[c]; |
|
467 |
if (rc->rc_tried) |
|
468 |
continue; |
|
469 |
zio_nowait(zio_vdev_child_io(zio, NULL, |
|
470 |
vd->vdev_child[rc->rc_col], |
|
471 |
rc->rc_offset, rc->rc_data, rc->rc_size, |
|
472 |
zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, |
|
473 |
vdev_raidz_child_done, rc)); |
|
474 |
} |
|
475 |
zio_wait_children_done(zio); |
|
476 |
return; |
|
477 |
} |
|
478 |
||
479 |
/* |
|
480 |
* If there were more errors than parity disks, give up. |
|
481 |
*/ |
|
482 |
if (zio->io_numerrors > rm->rm_firstdatacol) { |
|
483 |
ASSERT(zio->io_error != 0); |
|
484 |
goto done; |
|
485 |
} |
|
486 |
||
487 |
/* |
|
488 |
* The number of I/O errors is correctable. Correct them here. |
|
489 |
*/ |
|
490 |
ASSERT(zio->io_numerrors <= rm->rm_firstdatacol); |
|
491 |
for (c = 0; c < rm->rm_cols; c++) { |
|
492 |
rc = &rm->rm_col[c]; |
|
493 |
ASSERT(rc->rc_tried); |
|
494 |
if (rc->rc_error) { |
|
495 |
vdev_raidz_reconstruct(rm, c); |
|
496 |
if (zio_checksum_error(zio) == 0) |
|
497 |
zio->io_error = 0; |
|
498 |
else |
|
499 |
zio->io_error = rc->rc_error; |
|
500 |
goto done; |
|
501 |
} |
|
502 |
} |
|
503 |
||
504 |
/* |
|
505 |
* There were no I/O errors, but the data doesn't checksum. |
|
506 |
* Try all permutations to see if we can find one that does. |
|
507 |
*/ |
|
508 |
ASSERT(zio->io_numerrors == 0); |
|
509 |
for (c = 0; c < rm->rm_cols; c++) { |
|
510 |
void *orig; |
|
511 |
rc = &rm->rm_col[c]; |
|
512 |
||
513 |
orig = zio_buf_alloc(rc->rc_size); |
|
514 |
bcopy(rc->rc_data, orig, rc->rc_size); |
|
515 |
vdev_raidz_reconstruct(rm, c); |
|
516 |
||
517 |
if (zio_checksum_error(zio) == 0) { |
|
518 |
zio_buf_free(orig, rc->rc_size); |
|
519 |
zio->io_error = 0; |
|
520 |
/* |
|
521 |
* If this child didn't know that it returned bad data, |
|
522 |
* inform it. |
|
523 |
*/ |
|
524 |
if (rc->rc_tried && rc->rc_error == 0) |
|
1544 | 525 |
raidz_checksum_error(zio, rc); |
789 | 526 |
rc->rc_error = ECKSUM; |
527 |
goto done; |
|
528 |
} |
|
529 |
||
530 |
bcopy(orig, rc->rc_data, rc->rc_size); |
|
531 |
zio_buf_free(orig, rc->rc_size); |
|
532 |
} |
|
533 |
||
534 |
/* |
|
1544 | 535 |
* All combinations failed to checksum. Generate checksum ereports for |
536 |
* every one. |
|
789 | 537 |
*/ |
538 |
zio->io_error = ECKSUM; |
|
1544 | 539 |
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { |
540 |
for (c = 0; c < rm->rm_cols; c++) { |
|
541 |
rc = &rm->rm_col[c]; |
|
542 |
zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, |
|
543 |
zio->io_spa, vd->vdev_child[rc->rc_col], zio, |
|
544 |
rc->rc_offset, rc->rc_size); |
|
545 |
} |
|
546 |
} |
|
789 | 547 |
|
548 |
done: |
|
549 |
zio_checksum_verified(zio); |
|
550 |
||
551 |
if (zio->io_error == 0 && (spa_mode & FWRITE) && |
|
552 |
(unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { |
|
1732 | 553 |
zio_t *rio; |
554 |
||
789 | 555 |
/* |
556 |
* Use the good data we have in hand to repair damaged children. |
|
1732 | 557 |
* |
558 |
* We issue all repair I/Os as children of 'rio' to arrange |
|
559 |
* that vdev_raidz_map_free(zio) will be invoked after all |
|
560 |
* repairs complete, but before we advance to the next stage. |
|
789 | 561 |
*/ |
1732 | 562 |
rio = zio_null(zio, zio->io_spa, |
563 |
vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); |
|
564 |
||
789 | 565 |
for (c = 0; c < rm->rm_cols; c++) { |
566 |
rc = &rm->rm_col[c]; |
|
567 |
cvd = vd->vdev_child[rc->rc_col]; |
|
568 |
||
1732 | 569 |
if (rc->rc_error == 0) |
570 |
continue; |
|
571 |
||
572 |
dprintf("%s resilvered %s @ 0x%llx error %d\n", |
|
573 |
vdev_description(vd), |
|
574 |
vdev_description(cvd), |
|
575 |
zio->io_offset, rc->rc_error); |
|
789 | 576 |
|
1732 | 577 |
zio_nowait(zio_vdev_child_io(rio, NULL, cvd, |
578 |
rc->rc_offset, rc->rc_data, rc->rc_size, |
|
579 |
ZIO_TYPE_WRITE, zio->io_priority, |
|
580 |
ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | |
|
581 |
ZIO_FLAG_DONT_PROPAGATE, NULL, NULL)); |
|
582 |
} |
|
789 | 583 |
|
1732 | 584 |
zio_nowait(rio); |
585 |
zio_wait_children_done(zio); |
|
586 |
return; |
|
789 | 587 |
} |
588 |
||
589 |
vdev_raidz_map_free(zio); |
|
590 |
zio_next_stage(zio); |
|
591 |
} |
|
592 |
||
593 |
static void |
|
594 |
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) |
|
595 |
{ |
|
596 |
if (faulted > 1) |
|
1544 | 597 |
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, |
598 |
VDEV_AUX_NO_REPLICAS); |
|
789 | 599 |
else if (degraded + faulted != 0) |
1544 | 600 |
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); |
789 | 601 |
else |
1544 | 602 |
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); |
789 | 603 |
} |
604 |
||
605 |
/*
 * Operations vector for RAID-Z vdevs.  The initializer is positional,
 * so the entries must stay in the order in which vdev_ops_t declares
 * its members.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,	/* open the children, compute asize/ashift */
	vdev_raidz_close,	/* close the children */
	vdev_raidz_asize,	/* psize -> allocated size */
	vdev_raidz_io_start,	/* build the column map, issue child I/Os */
	vdev_raidz_io_done,	/* verify, reconstruct and repair */
	vdev_raidz_state_change, /* derive state from the children's states */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};