author | gw25295 |
Mon, 29 Oct 2007 17:24:59 -0700 | |
changeset 5369 | 27c1235ef9a4 |
parent 5329 | 33cb98223b2d |
child 5530 | 4ed96167d864 |
permissions | -rw-r--r-- |
789 | 1 |
/* |
2 |
* CDDL HEADER START |
|
3 |
* |
|
4 |
* The contents of this file are subject to the terms of the |
|
1544 | 5 |
* Common Development and Distribution License (the "License"). |
6 |
* You may not use this file except in compliance with the License. |
|
789 | 7 |
* |
8 |
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
|
9 |
* or http://www.opensolaris.org/os/licensing. |
|
10 |
* See the License for the specific language governing permissions |
|
11 |
* and limitations under the License. |
|
12 |
* |
|
13 |
* When distributing Covered Code, include this CDDL HEADER in each |
|
14 |
* file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
|
15 |
* If applicable, add the following below this CDDL HEADER, with the |
|
16 |
* fields enclosed by brackets "[]" replaced with your own identifying |
|
17 |
* information: Portions Copyright [yyyy] [name of copyright owner] |
|
18 |
* |
|
19 |
* CDDL HEADER END |
|
20 |
*/ |
|
2082 | 21 |
|
789 | 22 |
/* |
3456 | 23 |
* Copyright 2007 Sun Microsystems, Inc. All rights reserved. |
789 | 24 |
* Use is subject to license terms. |
25 |
*/ |
|
26 |
||
27 |
#pragma ident "%Z%%M% %I% %E% SMI" |
|
28 |
||
29 |
#include <sys/zfs_context.h> |
|
30 |
#include <sys/spa.h> |
|
31 |
#include <sys/vdev_impl.h> |
|
32 |
#include <sys/zio.h> |
|
33 |
#include <sys/zio_checksum.h> |
|
34 |
#include <sys/fs/zfs.h> |
|
1544 | 35 |
#include <sys/fm/fs/zfs.h> |
789 | 36 |
|
37 |
/* |
|
38 |
* Virtual device vector for RAID-Z. |
|
2082 | 39 |
* |
40 |
* This vdev supports both single and double parity. For single parity, we |
|
41 |
* use a simple XOR of all the data columns. For double parity, we use both |
|
42 |
* the simple XOR as well as a technique described in "The mathematics of |
|
43 |
* RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), |
|
44 |
* over the integers expressible in a single byte. Briefly, the operations on |
|
45 |
* the field are defined as follows: |
|
46 |
* |
|
47 |
* o addition (+) is represented by a bitwise XOR |
|
48 |
* o subtraction (-) is therefore identical to addition: A + B = A - B |
|
49 |
* o multiplication of A by 2 is defined by the following bitwise expression: |
|
50 |
* (A * 2)_7 = A_6 |
|
51 |
* (A * 2)_6 = A_5 |
|
52 |
* (A * 2)_5 = A_4 |
|
53 |
* (A * 2)_4 = A_3 + A_7 |
|
54 |
* (A * 2)_3 = A_2 + A_7 |
|
55 |
* (A * 2)_2 = A_1 + A_7 |
|
56 |
* (A * 2)_1 = A_0 |
|
57 |
* (A * 2)_0 = A_7 |
|
58 |
* |
|
59 |
* In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). |
|
60 |
* |
|
61 |
* Observe that any number in the field (except for 0) can be expressed as a |
|
62 |
* power of 2 -- a generator for the field. We store a table of the powers of |
|
63 |
* 2 and logs base 2 for quick look ups, and exploit the fact that A * B can |
|
64 |
* be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather |
|
65 |
* than field addition). The inverse of a field element A (A^-1) is A^254. |
|
66 |
* |
|
67 |
* The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, |
|
68 |
* can be expressed by field operations: |
|
69 |
* |
|
70 |
* P = D_0 + D_1 + ... + D_n-2 + D_n-1 |
|
71 |
* Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 |
|
72 |
* = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 |
|
73 |
* |
|
74 |
* See the reconstruction code below for how P and Q can be used individually or |
|
75 |
* in concert to recover missing data columns. |
|
789 | 76 |
*/ |
77 |
||
78 |
/*
 * State for a single column of a RAID-Z I/O: which child it is sent to,
 * where on that child it lands, the buffer involved, and what happened.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* index into the parent's child array */
	uint64_t rc_offset;		/* offset on that child */
	uint64_t rc_size;		/* size of this column's I/O, in bytes */
	void *rc_data;			/* buffer holding this column's data */
	int rc_error;			/* error recorded for this column */
	uint8_t rc_tried;		/* nonzero once the I/O was attempted */
	uint8_t rc_skipped;		/* nonzero if the I/O was skipped */
} raidz_col_t;
87 |
||
88 |
typedef struct raidz_map { |
|
2082 | 89 |
uint64_t rm_cols; /* Column count */ |
90 |
uint64_t rm_bigcols; /* Number of oversized columns */ |
|
91 |
uint64_t rm_asize; /* Actual total I/O size */ |
|
92 |
uint64_t rm_missingdata; /* Count of missing data devices */ |
|
93 |
uint64_t rm_missingparity; /* Count of missing parity devices */ |
|
94 |
uint64_t rm_firstdatacol; /* First data column/parity count */ |
|
95 |
raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ |
|
789 | 96 |
} raidz_map_t; |
97 |
||
2082 | 98 |
/* Positions of the P and Q parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1

/* Largest parity count this implementation accepts. */
#define	VDEV_RAIDZ_MAXPARITY	2

/*
 * Multiply a single field element by 2: shift left and, when bit 7 was set,
 * fold in 0x1d.  The result is not truncated to 8 bits by the macro itself.
 */
#define	VDEV_RAIDZ_MUL_2(a) \
	(((a) & 0x80) ? (((a) << 1) ^ 0x1d) : ((a) << 1))
|
104 |
||
105 |
/* |
|
106 |
* These two tables represent powers and logs of 2 in the Galois field defined |
|
107 |
* above. These values were computed by repeatedly multiplying by 2 as above. |
|
108 |
*/ |
|
109 |
/*
 * vdev_raidz_pow2[i] is 2 raised to the power i in the field; the leading
 * comment on each row gives the index of that row's first entry.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	/* 0x00 */ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	/* 0x08 */ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	/* 0x10 */ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	/* 0x18 */ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	/* 0x20 */ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	/* 0x28 */ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	/* 0x30 */ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	/* 0x38 */ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	/* 0x40 */ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	/* 0x48 */ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	/* 0x50 */ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	/* 0x58 */ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	/* 0x60 */ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	/* 0x68 */ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	/* 0x70 */ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	/* 0x78 */ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	/* 0x80 */ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	/* 0x88 */ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	/* 0x90 */ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	/* 0x98 */ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	/* 0xa0 */ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	/* 0xa8 */ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	/* 0xb0 */ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	/* 0xb8 */ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	/* 0xc0 */ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	/* 0xc8 */ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	/* 0xd0 */ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	/* 0xd8 */ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	/* 0xe0 */ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	/* 0xe8 */ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	/* 0xf0 */ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	/* 0xf8 */ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
|
143 |
/*
 * vdev_raidz_log2[v] is the exponent i such that 2^i == v in the field.
 * Entry 0 is a placeholder since 0 has no logarithm.  The leading comment
 * on each row gives the index of that row's first entry.
 */
static const uint8_t vdev_raidz_log2[256] = {
	/* 0x00 */ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	/* 0x08 */ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	/* 0x10 */ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	/* 0x18 */ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	/* 0x20 */ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	/* 0x28 */ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	/* 0x30 */ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	/* 0x38 */ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	/* 0x40 */ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	/* 0x48 */ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	/* 0x50 */ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	/* 0x58 */ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	/* 0x60 */ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	/* 0x68 */ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	/* 0x70 */ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	/* 0x78 */ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	/* 0x80 */ 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	/* 0x88 */ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	/* 0x90 */ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	/* 0x98 */ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	/* 0xa0 */ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	/* 0xa8 */ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	/* 0xb0 */ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	/* 0xb8 */ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	/* 0xc0 */ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	/* 0xc8 */ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	/* 0xd0 */ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	/* 0xd8 */ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	/* 0xe0 */ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	/* 0xe8 */ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	/* 0xf0 */ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	/* 0xf8 */ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
|
177 |
||
178 |
/* |
|
179 |
* Multiply a given number by 2 raised to the given power. |
|
180 |
*/ |
|
181 |
static uint8_t |
|
182 |
vdev_raidz_exp2(uint_t a, int exp) |
|
183 |
{ |
|
184 |
if (a == 0) |
|
185 |
return (0); |
|
186 |
||
187 |
ASSERT(exp >= 0); |
|
188 |
ASSERT(vdev_raidz_log2[a] > 0 || a == 1); |
|
189 |
||
190 |
exp += vdev_raidz_log2[a]; |
|
191 |
if (exp > 255) |
|
192 |
exp -= 255; |
|
193 |
||
194 |
return (vdev_raidz_pow2[exp]); |
|
195 |
} |
|
196 |
||
789 | 197 |
static raidz_map_t * |
2082 | 198 |
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, |
199 |
uint64_t nparity) |
|
789 | 200 |
{ |
201 |
raidz_map_t *rm; |
|
202 |
uint64_t b = zio->io_offset >> unit_shift; |
|
203 |
uint64_t s = zio->io_size >> unit_shift; |
|
204 |
uint64_t f = b % dcols; |
|
205 |
uint64_t o = (b / dcols) << unit_shift; |
|
2082 | 206 |
uint64_t q, r, c, bc, col, acols, coff, devidx; |
789 | 207 |
|
2082 | 208 |
q = s / (dcols - nparity); |
209 |
r = s - q * (dcols - nparity); |
|
210 |
bc = (r == 0 ? 0 : r + nparity); |
|
789 | 211 |
|
212 |
acols = (q == 0 ? bc : dcols); |
|
213 |
||
214 |
rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); |
|
215 |
||
216 |
rm->rm_cols = acols; |
|
217 |
rm->rm_bigcols = bc; |
|
218 |
rm->rm_asize = 0; |
|
2082 | 219 |
rm->rm_missingdata = 0; |
220 |
rm->rm_missingparity = 0; |
|
221 |
rm->rm_firstdatacol = nparity; |
|
789 | 222 |
|
223 |
for (c = 0; c < acols; c++) { |
|
224 |
col = f + c; |
|
225 |
coff = o; |
|
226 |
if (col >= dcols) { |
|
227 |
col -= dcols; |
|
228 |
coff += 1ULL << unit_shift; |
|
229 |
} |
|
2082 | 230 |
rm->rm_col[c].rc_devidx = col; |
789 | 231 |
rm->rm_col[c].rc_offset = coff; |
232 |
rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; |
|
233 |
rm->rm_col[c].rc_data = NULL; |
|
234 |
rm->rm_col[c].rc_error = 0; |
|
235 |
rm->rm_col[c].rc_tried = 0; |
|
236 |
rm->rm_col[c].rc_skipped = 0; |
|
237 |
rm->rm_asize += rm->rm_col[c].rc_size; |
|
238 |
} |
|
239 |
||
2082 | 240 |
rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); |
789 | 241 |
|
242 |
for (c = 0; c < rm->rm_firstdatacol; c++) |
|
243 |
rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); |
|
244 |
||
245 |
rm->rm_col[c].rc_data = zio->io_data; |
|
246 |
||
247 |
for (c = c + 1; c < acols; c++) |
|
248 |
rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + |
|
249 |
rm->rm_col[c - 1].rc_size; |
|
250 |
||
1133
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
251 |
/* |
2082 | 252 |
* If all data stored spans all columns, there's a danger that parity |
253 |
* will always be on the same device and, since parity isn't read |
|
254 |
* during normal operation, that that device's I/O bandwidth won't be |
|
255 |
* used effectively. We therefore switch the parity every 1MB. |
|
256 |
* |
|
257 |
* ... at least that was, ostensibly, the theory. As a practical |
|
258 |
* matter unless we juggle the parity between all devices evenly, we |
|
259 |
* won't see any benefit. Further, occasional writes that aren't a |
|
260 |
* multiple of the LCM of the number of children and the minimum |
|
261 |
* stripe width are sufficient to avoid pessimal behavior. |
|
262 |
* Unfortunately, this decision created an implicit on-disk format |
|
3456 | 263 |
* requirement that we need to support for all eternity, but only |
264 |
* for single-parity RAID-Z. |
|
1133
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
265 |
*/ |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
266 |
ASSERT(rm->rm_cols >= 2); |
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
267 |
ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); |
789 | 268 |
|
2082 | 269 |
if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { |
270 |
devidx = rm->rm_col[0].rc_devidx; |
|
1133
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
271 |
o = rm->rm_col[0].rc_offset; |
2082 | 272 |
rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; |
1133
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
273 |
rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; |
2082 | 274 |
rm->rm_col[1].rc_devidx = devidx; |
1133
335d069294d1
6357470 vdev_raidz.c has unused RAIDZ_SINGLE define, code
eschrock
parents:
789
diff
changeset
|
275 |
rm->rm_col[1].rc_offset = o; |
789 | 276 |
} |
277 |
||
278 |
zio->io_vsd = rm; |
|
279 |
return (rm); |
|
280 |
} |
|
281 |
||
282 |
static void |
|
283 |
vdev_raidz_map_free(zio_t *zio) |
|
284 |
{ |
|
285 |
raidz_map_t *rm = zio->io_vsd; |
|
286 |
int c; |
|
287 |
||
288 |
for (c = 0; c < rm->rm_firstdatacol; c++) |
|
289 |
zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); |
|
290 |
||
291 |
kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); |
|
292 |
zio->io_vsd = NULL; |
|
293 |
} |
|
294 |
||
295 |
static void |
|
2082 | 296 |
vdev_raidz_generate_parity_p(raidz_map_t *rm) |
297 |
{ |
|
298 |
uint64_t *p, *src, pcount, ccount, i; |
|
299 |
int c; |
|
300 |
||
301 |
pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); |
|
302 |
||
303 |
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { |
|
304 |
src = rm->rm_col[c].rc_data; |
|
305 |
p = rm->rm_col[VDEV_RAIDZ_P].rc_data; |
|
306 |
ccount = rm->rm_col[c].rc_size / sizeof (src[0]); |
|
307 |
||
308 |
if (c == rm->rm_firstdatacol) { |
|
309 |
ASSERT(ccount == pcount); |
|
310 |
for (i = 0; i < ccount; i++, p++, src++) { |
|
311 |
*p = *src; |
|
312 |
} |
|
313 |
} else { |
|
314 |
ASSERT(ccount <= pcount); |
|
315 |
for (i = 0; i < ccount; i++, p++, src++) { |
|
316 |
*p ^= *src; |
|
317 |
} |
|
318 |
} |
|
319 |
} |
|
320 |
} |
|
321 |
||
322 |
static void |
|
323 |
vdev_raidz_generate_parity_pq(raidz_map_t *rm) |
|
789 | 324 |
{ |
2082 | 325 |
uint64_t *q, *p, *src, pcount, ccount, mask, i; |
326 |
int c; |
|
327 |
||
328 |
pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); |
|
329 |
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == |
|
330 |
rm->rm_col[VDEV_RAIDZ_Q].rc_size); |
|
331 |
||
332 |
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { |
|
333 |
src = rm->rm_col[c].rc_data; |
|
334 |
p = rm->rm_col[VDEV_RAIDZ_P].rc_data; |
|
335 |
q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; |
|
336 |
ccount = rm->rm_col[c].rc_size / sizeof (src[0]); |
|
337 |
||
338 |
if (c == rm->rm_firstdatacol) { |
|
339 |
ASSERT(ccount == pcount || ccount == 0); |
|
340 |
for (i = 0; i < ccount; i++, p++, q++, src++) { |
|
341 |
*q = *src; |
|
342 |
*p = *src; |
|
343 |
} |
|
344 |
for (; i < pcount; i++, p++, q++, src++) { |
|
345 |
*q = 0; |
|
346 |
*p = 0; |
|
347 |
} |
|
348 |
} else { |
|
349 |
ASSERT(ccount <= pcount); |
|
789 | 350 |
|
2082 | 351 |
/* |
352 |
* Rather than multiplying each byte individually (as |
|
353 |
* described above), we are able to handle 8 at once |
|
354 |
* by generating a mask based on the high bit in each |
|
355 |
* byte and using that to conditionally XOR in 0x1d. |
|
356 |
*/ |
|
357 |
for (i = 0; i < ccount; i++, p++, q++, src++) { |
|
358 |
mask = *q & 0x8080808080808080ULL; |
|
359 |
mask = (mask << 1) - (mask >> 7); |
|
360 |
*q = ((*q << 1) & 0xfefefefefefefefeULL) ^ |
|
361 |
(mask & 0x1d1d1d1d1d1d1d1dULL); |
|
362 |
*q ^= *src; |
|
363 |
*p ^= *src; |
|
364 |
} |
|
365 |
||
366 |
/* |
|
367 |
* Treat short columns as though they are full of 0s. |
|
368 |
*/ |
|
369 |
for (; i < pcount; i++, q++) { |
|
370 |
mask = *q & 0x8080808080808080ULL; |
|
371 |
mask = (mask << 1) - (mask >> 7); |
|
372 |
*q = ((*q << 1) & 0xfefefefefefefefeULL) ^ |
|
373 |
(mask & 0x1d1d1d1d1d1d1d1dULL); |
|
374 |
} |
|
375 |
} |
|
376 |
} |
|
377 |
} |
|
378 |
||
379 |
static void |
|
380 |
vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) |
|
381 |
{ |
|
382 |
uint64_t *dst, *src, xcount, ccount, count, i; |
|
383 |
int c; |
|
384 |
||
385 |
xcount = rm->rm_col[x].rc_size / sizeof (src[0]); |
|
386 |
ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); |
|
387 |
ASSERT(xcount > 0); |
|
388 |
||
389 |
src = rm->rm_col[VDEV_RAIDZ_P].rc_data; |
|
390 |
dst = rm->rm_col[x].rc_data; |
|
391 |
for (i = 0; i < xcount; i++, dst++, src++) { |
|
392 |
*dst = *src; |
|
393 |
} |
|
394 |
||
395 |
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { |
|
789 | 396 |
src = rm->rm_col[c].rc_data; |
397 |
dst = rm->rm_col[x].rc_data; |
|
2082 | 398 |
|
399 |
if (c == x) |
|
400 |
continue; |
|
401 |
||
402 |
ccount = rm->rm_col[c].rc_size / sizeof (src[0]); |
|
403 |
count = MIN(ccount, xcount); |
|
404 |
||
405 |
for (i = 0; i < count; i++, dst++, src++) { |
|
406 |
*dst ^= *src; |
|
789 | 407 |
} |
408 |
} |
|
409 |
} |
|
410 |
||
2082 | 411 |
static void |
412 |
vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) |
|
413 |
{ |
|
414 |
uint64_t *dst, *src, xcount, ccount, count, mask, i; |
|
415 |
uint8_t *b; |
|
416 |
int c, j, exp; |
|
417 |
||
418 |
xcount = rm->rm_col[x].rc_size / sizeof (src[0]); |
|
419 |
ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); |
|
420 |
||
421 |
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { |
|
422 |
src = rm->rm_col[c].rc_data; |
|
423 |
dst = rm->rm_col[x].rc_data; |
|
424 |
||
425 |
if (c == x) |
|
426 |
ccount = 0; |
|
427 |
else |
|
428 |
ccount = rm->rm_col[c].rc_size / sizeof (src[0]); |
|
429 |
||
430 |
count = MIN(ccount, xcount); |
|
431 |
||
432 |
if (c == rm->rm_firstdatacol) { |
|
433 |
for (i = 0; i < count; i++, dst++, src++) { |
|
434 |
*dst = *src; |
|
435 |
} |
|
436 |
for (; i < xcount; i++, dst++) { |
|
437 |
*dst = 0; |
|
438 |
} |
|
439 |
||
440 |
} else { |
|
441 |
/* |
|
442 |
* For an explanation of this, see the comment in |
|
443 |
* vdev_raidz_generate_parity_pq() above. |
|
444 |
*/ |
|
445 |
for (i = 0; i < count; i++, dst++, src++) { |
|
446 |
mask = *dst & 0x8080808080808080ULL; |
|
447 |
mask = (mask << 1) - (mask >> 7); |
|
448 |
*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ |
|
449 |
(mask & 0x1d1d1d1d1d1d1d1dULL); |
|
450 |
*dst ^= *src; |
|
451 |
} |
|
452 |
||
453 |
for (; i < xcount; i++, dst++) { |
|
454 |
mask = *dst & 0x8080808080808080ULL; |
|
455 |
mask = (mask << 1) - (mask >> 7); |
|
456 |
*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ |
|
457 |
(mask & 0x1d1d1d1d1d1d1d1dULL); |
|
458 |
} |
|
459 |
} |
|
460 |
} |
|
461 |
||
462 |
src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; |
|
463 |
dst = rm->rm_col[x].rc_data; |
|
464 |
exp = 255 - (rm->rm_cols - 1 - x); |
|
465 |
||
466 |
for (i = 0; i < xcount; i++, dst++, src++) { |
|
467 |
*dst ^= *src; |
|
468 |
for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { |
|
469 |
*b = vdev_raidz_exp2(*b, exp); |
|
470 |
} |
|
471 |
} |
|
472 |
} |
|
473 |
||
474 |
static void |
|
475 |
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) |
|
476 |
{ |
|
477 |
uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; |
|
478 |
void *pdata, *qdata; |
|
479 |
uint64_t xsize, ysize, i; |
|
480 |
||
481 |
ASSERT(x < y); |
|
482 |
ASSERT(x >= rm->rm_firstdatacol); |
|
483 |
ASSERT(y < rm->rm_cols); |
|
484 |
||
485 |
ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); |
|
486 |
||
487 |
/* |
|
488 |
* Move the parity data aside -- we're going to compute parity as |
|
489 |
* though columns x and y were full of zeros -- Pxy and Qxy. We want to |
|
490 |
* reuse the parity generation mechanism without trashing the actual |
|
491 |
* parity so we make those columns appear to be full of zeros by |
|
492 |
* setting their lengths to zero. |
|
493 |
*/ |
|
494 |
pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; |
|
495 |
qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; |
|
496 |
xsize = rm->rm_col[x].rc_size; |
|
497 |
ysize = rm->rm_col[y].rc_size; |
|
498 |
||
499 |
rm->rm_col[VDEV_RAIDZ_P].rc_data = |
|
500 |
zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); |
|
501 |
rm->rm_col[VDEV_RAIDZ_Q].rc_data = |
|
502 |
zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); |
|
503 |
rm->rm_col[x].rc_size = 0; |
|
504 |
rm->rm_col[y].rc_size = 0; |
|
505 |
||
506 |
vdev_raidz_generate_parity_pq(rm); |
|
507 |
||
508 |
rm->rm_col[x].rc_size = xsize; |
|
509 |
rm->rm_col[y].rc_size = ysize; |
|
510 |
||
511 |
p = pdata; |
|
512 |
q = qdata; |
|
513 |
pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; |
|
514 |
qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; |
|
515 |
xd = rm->rm_col[x].rc_data; |
|
516 |
yd = rm->rm_col[y].rc_data; |
|
517 |
||
518 |
/* |
|
519 |
* We now have: |
|
520 |
* Pxy = P + D_x + D_y |
|
521 |
* Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y |
|
522 |
* |
|
523 |
* We can then solve for D_x: |
|
524 |
* D_x = A * (P + Pxy) + B * (Q + Qxy) |
|
525 |
* where |
|
526 |
* A = 2^(x - y) * (2^(x - y) + 1)^-1 |
|
527 |
* B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 |
|
528 |
* |
|
529 |
* With D_x in hand, we can easily solve for D_y: |
|
530 |
* D_y = P + Pxy + D_x |
|
531 |
*/ |
|
532 |
||
533 |
a = vdev_raidz_pow2[255 + x - y]; |
|
534 |
b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; |
|
535 |
tmp = 255 - vdev_raidz_log2[a ^ 1]; |
|
536 |
||
537 |
aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; |
|
538 |
bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; |
|
539 |
||
540 |
for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { |
|
541 |
*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ |
|
542 |
vdev_raidz_exp2(*q ^ *qxy, bexp); |
|
543 |
||
544 |
if (i < ysize) |
|
545 |
*yd = *p ^ *pxy ^ *xd; |
|
546 |
} |
|
547 |
||
548 |
zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, |
|
549 |
rm->rm_col[VDEV_RAIDZ_P].rc_size); |
|
550 |
zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, |
|
551 |
rm->rm_col[VDEV_RAIDZ_Q].rc_size); |
|
552 |
||
553 |
/* |
|
554 |
* Restore the saved parity data. |
|
555 |
*/ |
|
556 |
rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; |
|
557 |
rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; |
|
558 |
} |
|
559 |
||
560 |
||
789 | 561 |
static int |
562 |
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) |
|
563 |
{ |
|
564 |
vdev_t *cvd; |
|
2082 | 565 |
uint64_t nparity = vd->vdev_nparity; |
789 | 566 |
int c, error; |
567 |
int lasterror = 0; |
|
568 |
int numerrors = 0; |
|
569 |
||
2082 | 570 |
ASSERT(nparity > 0); |
571 |
||
572 |
if (nparity > VDEV_RAIDZ_MAXPARITY || |
|
573 |
vd->vdev_children < nparity + 1) { |
|
789 | 574 |
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; |
575 |
return (EINVAL); |
|
576 |
} |
|
577 |
||
578 |
for (c = 0; c < vd->vdev_children; c++) { |
|
579 |
cvd = vd->vdev_child[c]; |
|
580 |
||
581 |
if ((error = vdev_open(cvd)) != 0) { |
|
582 |
lasterror = error; |
|
583 |
numerrors++; |
|
584 |
continue; |
|
585 |
} |
|
586 |
||
587 |
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; |
|
1732 | 588 |
*ashift = MAX(*ashift, cvd->vdev_ashift); |
789 | 589 |
} |
590 |
||
591 |
*asize *= vd->vdev_children; |
|
592 |
||
2082 | 593 |
if (numerrors > nparity) { |
789 | 594 |
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; |
595 |
return (lasterror); |
|
596 |
} |
|
597 |
||
598 |
return (0); |
|
599 |
} |
|
600 |
||
601 |
static void |
|
602 |
vdev_raidz_close(vdev_t *vd) |
|
603 |
{ |
|
604 |
int c; |
|
605 |
||
606 |
for (c = 0; c < vd->vdev_children; c++) |
|
607 |
vdev_close(vd->vdev_child[c]); |
|
608 |
} |
|
609 |
||
610 |
static uint64_t |
|
611 |
vdev_raidz_asize(vdev_t *vd, uint64_t psize) |
|
612 |
{ |
|
613 |
uint64_t asize; |
|
1732 | 614 |
uint64_t ashift = vd->vdev_top->vdev_ashift; |
789 | 615 |
uint64_t cols = vd->vdev_children; |
2082 | 616 |
uint64_t nparity = vd->vdev_nparity; |
789 | 617 |
|
1732 | 618 |
asize = ((psize - 1) >> ashift) + 1; |
2082 | 619 |
asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); |
620 |
asize = roundup(asize, nparity + 1) << ashift; |
|
789 | 621 |
|
622 |
return (asize); |
|
623 |
} |
|
624 |
||
625 |
static void |
|
626 |
vdev_raidz_child_done(zio_t *zio) |
|
627 |
{ |
|
628 |
raidz_col_t *rc = zio->io_private; |
|
629 |
||
630 |
rc->rc_error = zio->io_error; |
|
631 |
rc->rc_tried = 1; |
|
632 |
rc->rc_skipped = 0; |
|
633 |
} |
|
634 |
||
635 |
static void |
|
636 |
vdev_raidz_repair_done(zio_t *zio) |
|
637 |
{ |
|
1732 | 638 |
ASSERT(zio->io_private == zio->io_parent); |
639 |
vdev_raidz_map_free(zio->io_private); |
|
789 | 640 |
} |
641 |
||
642 |
static void |
|
643 |
vdev_raidz_io_start(zio_t *zio) |
|
644 |
{ |
|
645 |
vdev_t *vd = zio->io_vd; |
|
1732 | 646 |
vdev_t *tvd = vd->vdev_top; |
789 | 647 |
vdev_t *cvd; |
648 |
blkptr_t *bp = zio->io_bp; |
|
649 |
raidz_map_t *rm; |
|
650 |
raidz_col_t *rc; |
|
651 |
int c; |
|
652 |
||
2082 | 653 |
rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, |
654 |
vd->vdev_nparity); |
|
789 | 655 |
|
1775
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1732
diff
changeset
|
656 |
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); |
789 | 657 |
|
658 |
if (zio->io_type == ZIO_TYPE_WRITE) { |
|
659 |
/* |
|
2082 | 660 |
* Generate RAID parity in the first virtual columns. |
789 | 661 |
*/ |
2082 | 662 |
if (rm->rm_firstdatacol == 1) |
663 |
vdev_raidz_generate_parity_p(rm); |
|
664 |
else |
|
665 |
vdev_raidz_generate_parity_pq(rm); |
|
789 | 666 |
|
667 |
for (c = 0; c < rm->rm_cols; c++) { |
|
668 |
rc = &rm->rm_col[c]; |
|
2082 | 669 |
cvd = vd->vdev_child[rc->rc_devidx]; |
789 | 670 |
zio_nowait(zio_vdev_child_io(zio, NULL, cvd, |
671 |
rc->rc_offset, rc->rc_data, rc->rc_size, |
|
672 |
zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, |
|
673 |
vdev_raidz_child_done, rc)); |
|
674 |
} |
|
675 |
zio_wait_children_done(zio); |
|
676 |
return; |
|
677 |
} |
|
678 |
||
679 |
ASSERT(zio->io_type == ZIO_TYPE_READ); |
|
680 |
||
2082 | 681 |
/* |
682 |
* Iterate over the columns in reverse order so that we hit the parity |
|
683 |
* last -- any errors along the way will force us to read the parity |
|
684 |
* data. |
|
685 |
*/ |
|
789 | 686 |
for (c = rm->rm_cols - 1; c >= 0; c--) { |
687 |
rc = &rm->rm_col[c]; |
|
2082 | 688 |
cvd = vd->vdev_child[rc->rc_devidx]; |
5329 | 689 |
if (!vdev_readable(cvd)) { |
2082 | 690 |
if (c >= rm->rm_firstdatacol) |
691 |
rm->rm_missingdata++; |
|
692 |
else |
|
693 |
rm->rm_missingparity++; |
|
789 | 694 |
rc->rc_error = ENXIO; |
695 |
rc->rc_tried = 1; /* don't even try */ |
|
696 |
rc->rc_skipped = 1; |
|
697 |
continue; |
|
698 |
} |
|
699 |
if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { |
|
2082 | 700 |
if (c >= rm->rm_firstdatacol) |
701 |
rm->rm_missingdata++; |
|
702 |
else |
|
703 |
rm->rm_missingparity++; |
|
789 | 704 |
rc->rc_error = ESTALE; |
705 |
rc->rc_skipped = 1; |
|
706 |
continue; |
|
707 |
} |
|
2082 | 708 |
if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || |
789 | 709 |
(zio->io_flags & ZIO_FLAG_SCRUB)) { |
710 |
zio_nowait(zio_vdev_child_io(zio, NULL, cvd, |
|
711 |
rc->rc_offset, rc->rc_data, rc->rc_size, |
|
712 |
zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, |
|
713 |
vdev_raidz_child_done, rc)); |
|
714 |
} |
|
715 |
} |
|
716 |
||
717 |
zio_wait_children_done(zio); |
|
718 |
} |
|
719 |
||
1544 | 720 |
/* |
721 |
* Report a checksum error for a child of a RAID-Z device. |
|
722 |
*/ |
|
723 |
static void |
|
724 |
raidz_checksum_error(zio_t *zio, raidz_col_t *rc) |
|
725 |
{ |
|
2082 | 726 |
vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; |
1544 | 727 |
dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", |
728 |
vdev_description(vd)); |
|
729 |
||
730 |
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { |
|
731 |
mutex_enter(&vd->vdev_stat_lock); |
|
732 |
vd->vdev_stat.vs_checksum_errors++; |
|
733 |
mutex_exit(&vd->vdev_stat_lock); |
|
734 |
} |
|
735 |
||
736 |
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) |
|
737 |
zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, |
|
738 |
zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); |
|
739 |
} |
|
740 |
||
2082 | 741 |
/* |
742 |
* Generate the parity from the data columns. If we tried and were able to |
|
743 |
* read the parity without error, verify that the generated parity matches the |
|
744 |
* data we read. If it doesn't, we fire off a checksum error. Return the |
|
745 |
* number such failures. |
|
746 |
*/ |
|
747 |
static int |
|
748 |
raidz_parity_verify(zio_t *zio, raidz_map_t *rm) |
|
749 |
{ |
|
750 |
void *orig[VDEV_RAIDZ_MAXPARITY]; |
|
751 |
int c, ret = 0; |
|
752 |
raidz_col_t *rc; |
|
753 |
||
754 |
for (c = 0; c < rm->rm_firstdatacol; c++) { |
|
755 |
rc = &rm->rm_col[c]; |
|
756 |
if (!rc->rc_tried || rc->rc_error != 0) |
|
757 |
continue; |
|
758 |
orig[c] = zio_buf_alloc(rc->rc_size); |
|
759 |
bcopy(rc->rc_data, orig[c], rc->rc_size); |
|
760 |
} |
|
761 |
||
762 |
if (rm->rm_firstdatacol == 1) |
|
763 |
vdev_raidz_generate_parity_p(rm); |
|
764 |
else |
|
765 |
vdev_raidz_generate_parity_pq(rm); |
|
766 |
||
767 |
for (c = 0; c < rm->rm_firstdatacol; c++) { |
|
768 |
rc = &rm->rm_col[c]; |
|
769 |
if (!rc->rc_tried || rc->rc_error != 0) |
|
770 |
continue; |
|
771 |
if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { |
|
772 |
raidz_checksum_error(zio, rc); |
|
773 |
rc->rc_error = ECKSUM; |
|
774 |
ret++; |
|
775 |
} |
|
776 |
zio_buf_free(orig[c], rc->rc_size); |
|
777 |
} |
|
778 |
||
779 |
return (ret); |
|
780 |
} |
|
781 |
||
782 |
/*
 * File-scope counters.
 * NOTE(review): nothing in the visible part of this file touches these;
 * presumably they count reads repaired via P, Q, or both -- confirm against
 * vdev_raidz_io_done().
 */
static uint64_t raidz_corrected_p;
static uint64_t raidz_corrected_q;
static uint64_t raidz_corrected_pq;
|
1544 | 785 |
|
789 | 786 |
static void |
787 |
vdev_raidz_io_done(zio_t *zio) |
|
788 |
{ |
|
789 |
vdev_t *vd = zio->io_vd; |
|
790 |
vdev_t *cvd; |
|
791 |
raidz_map_t *rm = zio->io_vsd; |
|
2082 | 792 |
raidz_col_t *rc, *rc1; |
789 | 793 |
int unexpected_errors = 0; |
2082 | 794 |
int parity_errors = 0; |
3456 | 795 |
int parity_untried = 0; |
2082 | 796 |
int data_errors = 0; |
797 |
int n, c, c1; |
|
789 | 798 |
|
1775
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1732
diff
changeset
|
799 |
ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ |
789 | 800 |
|
801 |
zio->io_error = 0; |
|
802 |
zio->io_numerrors = 0; |
|
803 |
||
2082 | 804 |
ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); |
805 |
ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); |
|
806 |
||
789 | 807 |
for (c = 0; c < rm->rm_cols; c++) { |
808 |
rc = &rm->rm_col[c]; |
|
809 |
||
810 |
/* |
|
811 |
* We preserve any EIOs because those may be worth retrying; |
|
812 |
* whereas ECKSUM and ENXIO are more likely to be persistent. |
|
813 |
*/ |
|
814 |
if (rc->rc_error) { |
|
815 |
if (zio->io_error != EIO) |
|
816 |
zio->io_error = rc->rc_error; |
|
2082 | 817 |
|
818 |
if (c < rm->rm_firstdatacol) |
|
819 |
parity_errors++; |
|
820 |
else |
|
821 |
data_errors++; |
|
822 |
||
789 | 823 |
if (!rc->rc_skipped) |
824 |
unexpected_errors++; |
|
2082 | 825 |
|
789 | 826 |
zio->io_numerrors++; |
3456 | 827 |
} else if (c < rm->rm_firstdatacol && !rc->rc_tried) { |
828 |
parity_untried++; |
|
789 | 829 |
} |
830 |
} |
|
831 |
||
832 |
if (zio->io_type == ZIO_TYPE_WRITE) { |
|
833 |
/* |
|
834 |
* If this is not a failfast write, and we were able to |
|
835 |
* write enough columns to reconstruct the data, good enough. |
|
836 |
*/ |
|
837 |
/* XXPOLICY */ |
|
838 |
if (zio->io_numerrors <= rm->rm_firstdatacol && |
|
839 |
!(zio->io_flags & ZIO_FLAG_FAILFAST)) |
|
840 |
zio->io_error = 0; |
|
841 |
||
842 |
vdev_raidz_map_free(zio); |
|
843 |
zio_next_stage(zio); |
|
844 |
return; |
|
845 |
} |
|
846 |
||
847 |
ASSERT(zio->io_type == ZIO_TYPE_READ); |
|
2082 | 848 |
/* |
849 |
* There are three potential phases for a read: |
|
850 |
* 1. produce valid data from the columns read |
|
851 |
* 2. read all disks and try again |
|
852 |
* 3. perform combinatorial reconstruction |
|
853 |
* |
|
854 |
* Each phase is progressively both more expensive and less likely to |
|
855 |
* occur. If we encounter more errors than we can repair or all phases |
|
856 |
* fail, we have no choice but to return an error. |
|
857 |
*/ |
|
789 | 858 |
|
859 |
/* |
|
2082 | 860 |
* If the number of errors we saw was correctable -- less than or equal |
3456 | 861 |
* to the number of parity disks read -- attempt to produce data that |
862 |
* has a valid checksum. Naturally, this case applies in the absence of |
|
863 |
* any errors. |
|
789 | 864 |
*/ |
3456 | 865 |
if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) { |
2082 | 866 |
switch (data_errors) { |
867 |
case 0: |
|
868 |
if (zio_checksum_error(zio) == 0) { |
|
869 |
zio->io_error = 0; |
|
4034 | 870 |
|
871 |
/* |
|
872 |
* If we read parity information (unnecessarily |
|
873 |
* as it happens since no reconstruction was |
|
874 |
* needed) regenerate and verify the parity. |
|
875 |
* We also regenerate parity when resilvering |
|
876 |
* so we can write it out to the failed device |
|
877 |
* later. |
|
878 |
*/ |
|
3456 | 879 |
if (parity_errors + parity_untried < |
4034 | 880 |
rm->rm_firstdatacol || |
881 |
(zio->io_flags & ZIO_FLAG_RESILVER)) { |
|
3456 | 882 |
n = raidz_parity_verify(zio, rm); |
883 |
unexpected_errors += n; |
|
884 |
ASSERT(parity_errors + n <= |
|
885 |
rm->rm_firstdatacol); |
|
886 |
} |
|
2082 | 887 |
goto done; |
888 |
} |
|
889 |
break; |
|
890 |
||
891 |
case 1: |
|
3456 | 892 |
/* |
893 |
* We either attempt to read all the parity columns or |
|
894 |
* none of them. If we didn't try to read parity, we |
|
895 |
* wouldn't be here in the correctable case. There must |
|
896 |
* also have been fewer parity errors than parity |
|
897 |
* columns or, again, we wouldn't be in this code path. |
|
898 |
*/ |
|
899 |
ASSERT(parity_untried == 0); |
|
2082 | 900 |
ASSERT(parity_errors < rm->rm_firstdatacol); |
901 |
||
902 |
/* |
|
903 |
* Find the column that reported the error. |
|
904 |
*/ |
|
905 |
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { |
|
906 |
rc = &rm->rm_col[c]; |
|
907 |
if (rc->rc_error != 0) |
|
908 |
break; |
|
909 |
} |
|
910 |
ASSERT(c != rm->rm_cols); |
|
911 |
ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || |
|
912 |
rc->rc_error == ESTALE); |
|
913 |
||
914 |
if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { |
|
915 |
vdev_raidz_reconstruct_p(rm, c); |
|
916 |
} else { |
|
917 |
ASSERT(rm->rm_firstdatacol > 1); |
|
918 |
vdev_raidz_reconstruct_q(rm, c); |
|
919 |
} |
|
920 |
||
921 |
if (zio_checksum_error(zio) == 0) { |
|
922 |
zio->io_error = 0; |
|
923 |
if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) |
|
924 |
atomic_inc_64(&raidz_corrected_p); |
|
925 |
else |
|
926 |
atomic_inc_64(&raidz_corrected_q); |
|
789 | 927 |
|
2082 | 928 |
/* |
3456 | 929 |
* If there's more than one parity disk that |
930 |
* was successfully read, confirm that the |
|
931 |
* other parity disk produced the correct data. |
|
932 |
* This routine is suboptimal in that it |
|
933 |
* regenerates both the parity we wish to test |
|
934 |
* as well as the parity we just used to |
|
935 |
* perform the reconstruction, but this should |
|
936 |
* be a relatively uncommon case, and can be |
|
937 |
* optimized if it becomes a problem. |
|
4034 | 938 |
* We also regenerate parity when resilvering |
939 |
* so we can write it out to the failed device |
|
940 |
* later. |
|
2082 | 941 |
*/ |
4034 | 942 |
if (parity_errors < rm->rm_firstdatacol - 1 || |
943 |
(zio->io_flags & ZIO_FLAG_RESILVER)) { |
|
2082 | 944 |
n = raidz_parity_verify(zio, rm); |
945 |
unexpected_errors += n; |
|
946 |
ASSERT(parity_errors + n <= |
|
947 |
rm->rm_firstdatacol); |
|
948 |
} |
|
949 |
||
950 |
goto done; |
|
951 |
} |
|
952 |
break; |
|
953 |
||
954 |
case 2: |
|
955 |
/* |
|
3456 | 956 |
* Two data column errors require double parity. |
957 |
*/ |
|
958 |
ASSERT(rm->rm_firstdatacol == 2); |
|
959 |
||
960 |
/* |
|
2082 | 961 |
* Find the two columns that reported errors. |
962 |
*/ |
|
963 |
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { |
|
964 |
rc = &rm->rm_col[c]; |
|
965 |
if (rc->rc_error != 0) |
|
966 |
break; |
|
789 | 967 |
} |
2082 | 968 |
ASSERT(c != rm->rm_cols); |
969 |
ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || |
|
970 |
rc->rc_error == ESTALE); |
|
971 |
||
972 |
for (c1 = c++; c < rm->rm_cols; c++) { |
|
973 |
rc = &rm->rm_col[c]; |
|
974 |
if (rc->rc_error != 0) |
|
975 |
break; |
|
976 |
} |
|
977 |
ASSERT(c != rm->rm_cols); |
|
978 |
ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || |
|
979 |
rc->rc_error == ESTALE); |
|
789 | 980 |
|
2082 | 981 |
vdev_raidz_reconstruct_pq(rm, c1, c); |
982 |
||
983 |
if (zio_checksum_error(zio) == 0) { |
|
984 |
zio->io_error = 0; |
|
985 |
atomic_inc_64(&raidz_corrected_pq); |
|
986 |
||
987 |
goto done; |
|
988 |
} |
|
989 |
break; |
|
990 |
||
991 |
default: |
|
992 |
ASSERT(rm->rm_firstdatacol <= 2); |
|
993 |
ASSERT(0); |
|
789 | 994 |
} |
995 |
} |
|
996 |
||
997 |
/* |
|
2082 | 998 |
* This isn't a typical situation -- either we got a read error or |
999 |
* a child silently returned bad data. Read every block so we can |
|
1000 |
* try again with as much data and parity as we can track down. If |
|
1001 |
* we've already been through once before, all children will be marked |
|
1002 |
* as tried so we'll proceed to combinatorial reconstruction. |
|
789 | 1003 |
*/ |
1004 |
unexpected_errors = 1; |
|
2082 | 1005 |
rm->rm_missingdata = 0; |
1006 |
rm->rm_missingparity = 0; |
|
789 | 1007 |
|
2082 | 1008 |
for (c = 0; c < rm->rm_cols; c++) { |
1009 |
if (rm->rm_col[c].rc_tried) |
|
1010 |
continue; |
|
789 | 1011 |
|
1012 |
zio->io_error = 0; |
|
1013 |
zio_vdev_io_redone(zio); |
|
2082 | 1014 |
do { |
789 | 1015 |
rc = &rm->rm_col[c]; |
1016 |
if (rc->rc_tried) |
|
1017 |
continue; |
|
1018 |
zio_nowait(zio_vdev_child_io(zio, NULL, |
|
2082 | 1019 |
vd->vdev_child[rc->rc_devidx], |
789 | 1020 |
rc->rc_offset, rc->rc_data, rc->rc_size, |
1021 |
zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, |
|
1022 |
vdev_raidz_child_done, rc)); |
|
2082 | 1023 |
} while (++c < rm->rm_cols); |
1024 |
dprintf("rereading\n"); |
|
789 | 1025 |
zio_wait_children_done(zio); |
1026 |
return; |
|
1027 |
} |
|
1028 |
||
1029 |
/* |
|
2082 | 1030 |
* At this point we've attempted to reconstruct the data given the |
1031 |
* errors we detected, and we've attempted to read all columns. There |
|
1032 |
* must, therefore, be one or more additional problems -- silent errors |
|
1033 |
* resulting in invalid data rather than explicit I/O errors resulting |
|
1034 |
* in absent data. Before we attempt combinatorial reconstruction make |
|
1035 |
* sure we have a chance of coming up with the right answer. |
|
789 | 1036 |
*/ |
2082 | 1037 |
if (zio->io_numerrors >= rm->rm_firstdatacol) { |
789 | 1038 |
ASSERT(zio->io_error != 0); |
1039 |
goto done; |
|
1040 |
} |
|
1041 |
||
2082 | 1042 |
if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { |
1043 |
/* |
|
1044 |
* Attempt to reconstruct the data from parity P. |
|
1045 |
*/ |
|
1046 |
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { |
|
1047 |
void *orig; |
|
1048 |
rc = &rm->rm_col[c]; |
|
1049 |
||
1050 |
orig = zio_buf_alloc(rc->rc_size); |
|
1051 |
bcopy(rc->rc_data, orig, rc->rc_size); |
|
1052 |
vdev_raidz_reconstruct_p(rm, c); |
|
1053 |
||
1054 |
if (zio_checksum_error(zio) == 0) { |
|
1055 |
zio_buf_free(orig, rc->rc_size); |
|
1056 |
zio->io_error = 0; |
|
1057 |
atomic_inc_64(&raidz_corrected_p); |
|
1058 |
||
1059 |
/* |
|
1060 |
* If this child didn't know that it returned |
|
1061 |
* bad data, inform it. |
|
1062 |
*/ |
|
1063 |
if (rc->rc_tried && rc->rc_error == 0) |
|
1064 |
raidz_checksum_error(zio, rc); |
|
1065 |
rc->rc_error = ECKSUM; |
|
1066 |
goto done; |
|
1067 |
} |
|
1068 |
||
1069 |
bcopy(orig, rc->rc_data, rc->rc_size); |
|
1070 |
zio_buf_free(orig, rc->rc_size); |
|
1071 |
} |
|
1072 |
} |
|
1073 |
||
1074 |
if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { |
|
1075 |
/* |
|
1076 |
* Attempt to reconstruct the data from parity Q. |
|
1077 |
*/ |
|
1078 |
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { |
|
1079 |
void *orig; |
|
1080 |
rc = &rm->rm_col[c]; |
|
1081 |
||
1082 |
orig = zio_buf_alloc(rc->rc_size); |
|
1083 |
bcopy(rc->rc_data, orig, rc->rc_size); |
|
1084 |
vdev_raidz_reconstruct_q(rm, c); |
|
1085 |
||
1086 |
if (zio_checksum_error(zio) == 0) { |
|
1087 |
zio_buf_free(orig, rc->rc_size); |
|
789 | 1088 |
zio->io_error = 0; |
2082 | 1089 |
atomic_inc_64(&raidz_corrected_q); |
1090 |
||
1091 |
/* |
|
1092 |
* If this child didn't know that it returned |
|
1093 |
* bad data, inform it. |
|
1094 |
*/ |
|
1095 |
if (rc->rc_tried && rc->rc_error == 0) |
|
1096 |
raidz_checksum_error(zio, rc); |
|
1097 |
rc->rc_error = ECKSUM; |
|
1098 |
goto done; |
|
1099 |
} |
|
1100 |
||
1101 |
bcopy(orig, rc->rc_data, rc->rc_size); |
|
1102 |
zio_buf_free(orig, rc->rc_size); |
|
1103 |
} |
|
1104 |
} |
|
1105 |
||
1106 |
if (rm->rm_firstdatacol > 1 && |
|
1107 |
rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && |
|
1108 |
rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { |
|
1109 |
/* |
|
1110 |
* Attempt to reconstruct the data from both P and Q. |
|
1111 |
*/ |
|
1112 |
for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { |
|
1113 |
void *orig, *orig1; |
|
1114 |
rc = &rm->rm_col[c]; |
|
1115 |
||
1116 |
orig = zio_buf_alloc(rc->rc_size); |
|
1117 |
bcopy(rc->rc_data, orig, rc->rc_size); |
|
1118 |
||
1119 |
for (c1 = c + 1; c1 < rm->rm_cols; c1++) { |
|
1120 |
rc1 = &rm->rm_col[c1]; |
|
1121 |
||
1122 |
orig1 = zio_buf_alloc(rc1->rc_size); |
|
1123 |
bcopy(rc1->rc_data, orig1, rc1->rc_size); |
|
1124 |
||
1125 |
vdev_raidz_reconstruct_pq(rm, c, c1); |
|
1126 |
||
1127 |
if (zio_checksum_error(zio) == 0) { |
|
1128 |
zio_buf_free(orig, rc->rc_size); |
|
1129 |
zio_buf_free(orig1, rc1->rc_size); |
|
1130 |
zio->io_error = 0; |
|
1131 |
atomic_inc_64(&raidz_corrected_pq); |
|
1132 |
||
1133 |
/* |
|
1134 |
* If these children didn't know they |
|
1135 |
* returned bad data, inform them. |
|
1136 |
*/ |
|
1137 |
if (rc->rc_tried && rc->rc_error == 0) |
|
1138 |
raidz_checksum_error(zio, rc); |
|
1139 |
if (rc1->rc_tried && rc1->rc_error == 0) |
|
1140 |
raidz_checksum_error(zio, rc1); |
|
1141 |
||
1142 |
rc->rc_error = ECKSUM; |
|
1143 |
rc1->rc_error = ECKSUM; |
|
1144 |
||
1145 |
goto done; |
|
1146 |
} |
|
1147 |
||
1148 |
bcopy(orig1, rc1->rc_data, rc1->rc_size); |
|
1149 |
zio_buf_free(orig1, rc1->rc_size); |
|
1150 |
} |
|
1151 |
||
1152 |
bcopy(orig, rc->rc_data, rc->rc_size); |
|
1153 |
zio_buf_free(orig, rc->rc_size); |
|
789 | 1154 |
} |
1155 |
} |
|
1156 |
||
1157 |
/* |
|
2082 | 1158 |
* All combinations failed to checksum. Generate checksum ereports for |
1159 |
* all children. |
|
789 | 1160 |
*/ |
1161 |
zio->io_error = ECKSUM; |
|
1544 | 1162 |
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { |
1163 |
for (c = 0; c < rm->rm_cols; c++) { |
|
1164 |
rc = &rm->rm_col[c]; |
|
1165 |
zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, |
|
2082 | 1166 |
zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, |
1544 | 1167 |
rc->rc_offset, rc->rc_size); |
1168 |
} |
|
1169 |
} |
|
789 | 1170 |
|
1171 |
done: |
|
1172 |
zio_checksum_verified(zio); |
|
1173 |
||
1174 |
if (zio->io_error == 0 && (spa_mode & FWRITE) && |
|
1175 |
(unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { |
|
1732 | 1176 |
zio_t *rio; |
1177 |
||
789 | 1178 |
/* |
1179 |
* Use the good data we have in hand to repair damaged children. |
|
1732 | 1180 |
* |
1181 |
* We issue all repair I/Os as children of 'rio' to arrange |
|
1182 |
* that vdev_raidz_map_free(zio) will be invoked after all |
|
1183 |
* repairs complete, but before we advance to the next stage. |
|
789 | 1184 |
*/ |
1732 | 1185 |
rio = zio_null(zio, zio->io_spa, |
1186 |
vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); |
|
1187 |
||
789 | 1188 |
for (c = 0; c < rm->rm_cols; c++) { |
1189 |
rc = &rm->rm_col[c]; |
|
2082 | 1190 |
cvd = vd->vdev_child[rc->rc_devidx]; |
789 | 1191 |
|
1732 | 1192 |
if (rc->rc_error == 0) |
1193 |
continue; |
|
1194 |
||
1195 |
dprintf("%s resilvered %s @ 0x%llx error %d\n", |
|
1196 |
vdev_description(vd), |
|
1197 |
vdev_description(cvd), |
|
1198 |
zio->io_offset, rc->rc_error); |
|
789 | 1199 |
|
1732 | 1200 |
zio_nowait(zio_vdev_child_io(rio, NULL, cvd, |
1201 |
rc->rc_offset, rc->rc_data, rc->rc_size, |
|
1202 |
ZIO_TYPE_WRITE, zio->io_priority, |
|
2082 | 1203 |
ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE | |
1204 |
ZIO_FLAG_CANFAIL, NULL, NULL)); |
|
1732 | 1205 |
} |
789 | 1206 |
|
1732 | 1207 |
zio_nowait(rio); |
1208 |
zio_wait_children_done(zio); |
|
1209 |
return; |
|
789 | 1210 |
} |
1211 |
||
1212 |
vdev_raidz_map_free(zio); |
|
1213 |
zio_next_stage(zio); |
|
1214 |
} |
|
1215 |
||
1216 |
static void |
|
1217 |
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) |
|
1218 |
{ |
|
2082 | 1219 |
if (faulted > vd->vdev_nparity) |
1544 | 1220 |
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, |
1221 |
VDEV_AUX_NO_REPLICAS); |
|
789 | 1222 |
else if (degraded + faulted != 0) |
1544 | 1223 |
vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); |
789 | 1224 |
else |
1544 | 1225 |
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); |
789 | 1226 |
} |
1227 |
||
1228 |
vdev_ops_t vdev_raidz_ops = { |
|
1229 |
vdev_raidz_open, |
|
1230 |
vdev_raidz_close, |
|
5329 | 1231 |
NULL, |
789 | 1232 |
vdev_raidz_asize, |
1233 |
vdev_raidz_io_start, |
|
1234 |
vdev_raidz_io_done, |
|
1235 |
vdev_raidz_state_change, |
|
1236 |
VDEV_TYPE_RAIDZ, /* name of this vdev type */ |
|
1237 |
B_FALSE /* not a leaf vdev */ |
|
1238 |
}; |