author:       lling
date:         Wed, 19 Sep 2007 10:32:40 -0700
changeset:    5094:71a3e95fb9e2
parent:       4451:24fbf2d7a5d7
child:        6523:c1d2a7f04573
permissions:  -rw-r--r--

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <assert.h>
#include <stddef.h>
#include <strings.h>
#include <libuutil.h>
#include <libzfs.h>
#include <fm/fmd_api.h>
#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>

/*
 * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'.  This
 * #define reserves enough space for two 64-bit hex values plus the length of
 * the longest string.
 */
#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))
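
/*
 * For example (hypothetical GUID values, shown only to illustrate the naming
 * format above): a pool GUID of 0x1234 and a vdev GUID of 0xabcd yield the
 * engine names "zfs_1234_abcd_io" and "zfs_1234_abcd_checksum".
 */

/*
 * zfs_case_data_t is the persistent portion of a case: it is the only part
 * written to (and read back from) the fmd case buffer by
 * zfs_case_serialize() and zfs_case_unserialize() below.  zfs_case_t wraps
 * it with purely in-core state (list linkage and installed timer ids).
 */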
typedef struct zfs_case_data {
	uint64_t	zc_version;
	uint64_t	zc_ena;
	uint64_t	zc_pool_guid;
	uint64_t	zc_vdev_guid;
	int		zc_has_timer;
	int		zc_pool_state;
	char		zc_serd_checksum[MAX_SERDLEN];
	char		zc_serd_io[MAX_SERDLEN];
	int		zc_has_serd_timer;
} zfs_case_data_t;

typedef struct zfs_case {
	boolean_t	zc_present;
	uint32_t	zc_version;
	zfs_case_data_t	zc_data;
	fmd_case_t	*zc_case;
	uu_list_node_t	zc_node;
	id_t		zc_timer;
	id_t		zc_serd_timer;
} zfs_case_t;

#define	CASE_DATA			"data"
#define	CASE_DATA_VERSION_INITIAL	1
#define	CASE_DATA_VERSION_SERD		2
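
/*
 * Version 1 (INITIAL) case buffers predate the SERD engine names; version 2
 * (SERD) appends them.  zfs_case_serialize() always writes the latest
 * version, and zfs_case_unserialize() relies on fmd_buf_read() zero-filling
 * the remainder of the buffer to accept older-version buffers transparently.
 */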
|
static hrtime_t zfs_case_timeout;
static hrtime_t zfs_serd_timeout;

uu_list_pool_t *zfs_case_pool;
uu_list_t *zfs_cases;

/*
 * Write out the persistent representation of an active case.
 */
static void
zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp)
{
	/*
	 * Always update cases to the latest version, even if they were the
	 * previous version when unserialized.
	 */
	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
	fmd_buf_write(hdl, zcp->zc_case, CASE_DATA, &zcp->zc_data,
	    sizeof (zcp->zc_data));
}

/*
 * Read back the persistent representation of an active case.
 */
static zfs_case_t *
zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	zfs_case_t *zcp;

	zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
	zcp->zc_case = cp;

	fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
	    sizeof (zcp->zc_data));

	if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
		return (NULL);
	}

	/*
	 * fmd_buf_read() will have already zeroed out the remainder of the
	 * buffer, so we don't have to do anything special if the version
	 * doesn't include the SERD engine name.
	 */

	if (zcp->zc_data.zc_has_timer)
		zcp->zc_timer = fmd_timer_install(hdl, zcp,
		    NULL, zfs_case_timeout);
	if (zcp->zc_data.zc_has_serd_timer)
		zcp->zc_serd_timer = fmd_timer_install(hdl, zcp,
		    NULL, zfs_serd_timeout);

	(void) uu_list_insert_before(zfs_cases, NULL, zcp);

	fmd_case_setspecific(hdl, cp, zcp);

	return (zcp);
}

/*
 * Iterate over any active cases.  If any cases are associated with a pool or
 * vdev which is no longer present on the system, close the associated case.
 */
static void
zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd)
{
	uint64_t vdev_guid;
	uint_t c, children;
	nvlist_t **child;
	zfs_case_t *zcp;
	int ret;

	ret = nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);
	assert(ret == 0);

	/*
	 * Mark any cases associated with this (pool, vdev) pair.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == vdev_guid)
			zcp->zc_present = B_TRUE;
	}

	/*
	 * Iterate over all children.
	 */
	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c]);
	}
}
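
/*
 * zpool_iter() callback, invoked once per pool on the system.  Marks any
 * pool-level cases for this pool (a zc_vdev_guid of 0 denotes a case against
 * the pool itself), then recursively marks per-vdev cases via
 * zfs_mark_vdev().
 */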
/*ARGSUSED*/
static int
zfs_mark_pool(zpool_handle_t *zhp, void *unused)
{
	zfs_case_t *zcp;
	uint64_t pool_guid;
	nvlist_t *config, *vd;
	int ret;

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	/*
	 * Mark any cases associated with just this pool.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == 0)
			zcp->zc_present = B_TRUE;
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL)
		return (-1);

	ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
	assert(ret == 0);

	zfs_mark_vdev(pool_guid, vd);

	return (0);
}
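
/*
 * Close any cases associated with a pool or vdev that has disappeared from
 * the system (called from the gc entry point and at module load).
 */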
static void
zfs_purge_cases(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);

	/*
	 * There is no way to open a pool by GUID, or lookup a vdev by GUID.
	 * No matter what we do, we're going to have to stomach an
	 * O(vdevs * cases) algorithm.  In reality, both quantities are likely
	 * so small that neither will matter.  Given that iterating over pools
	 * is more expensive than iterating over the in-memory case list, we
	 * opt for a 'present' flag in each case that starts off cleared.  We
	 * then iterate over all pools, marking those that are still present,
	 * and removing those that aren't found.
	 *
	 * Note that we could also construct an FMRI and rely on
	 * fmd_nvl_fmri_present(), but this would end up doing the same search.
	 */

	/*
	 * Mark the cases as not present.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp))
		zcp->zc_present = B_FALSE;

	/*
	 * Iterate over all pools and mark the pools and vdevs found.  If this
	 * fails (most probably because we're out of memory), don't close any
	 * of the cases, since we cannot be sure the marks are accurate.
	 */
	if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
		return;

	/*
	 * Remove those cases which were not found.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		if (!zcp->zc_present)
			fmd_case_close(hdl, zcp->zc_case);
	}
	uu_list_walk_end(walk);
}

/*
 * Construct the name of a serd engine given the pool/vdev GUID and type (io or
 * checksum).
 */
static void
zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
    const char *type)
{
	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s", pool_guid,
	    vdev_guid, type);
}

/*
 * Solve a given ZFS case.  This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
    boolean_t checkunusable)
{
	nvlist_t *detector, *fault;
	boolean_t serialize;

	/*
	 * Construct the detector from the case data.  The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on whether
	 * this is a vdev or pool fault.
	 */
	if (nvlist_alloc(&detector, NV_UNIQUE_NAME, 0) != 0)
		return;

	if (nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0) != 0 ||
	    nvlist_add_string(detector, FM_FMRI_SCHEME,
	    FM_FMRI_SCHEME_ZFS) != 0 ||
	    nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid) != 0 ||
	    (zcp->zc_data.zc_vdev_guid != 0 &&
	    nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
	    zcp->zc_data.zc_vdev_guid) != 0)) {
		nvlist_free(detector);
		return;
	}

	/*
	 * We also want to make sure that the detector (pool or vdev) properly
	 * reflects the diagnosed state, when the fault corresponds to internal
	 * ZFS state (i.e. not checksum or I/O error-induced).  Otherwise, a
	 * device which was unavailable early in boot (because the driver/file
	 * wasn't available) and is now healthy will be mis-diagnosed.
	 */
	if (!fmd_nvl_fmri_present(hdl, detector) ||
	    (checkunusable && !fmd_nvl_fmri_unusable(hdl, detector))) {
		fmd_case_close(hdl, zcp->zc_case);
		nvlist_free(detector);
		return;
	}

	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector, NULL,
	    detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);
	fmd_case_solve(hdl, zcp->zc_case);

	serialize = B_FALSE;
	if (zcp->zc_data.zc_has_timer) {
		fmd_timer_remove(hdl, zcp->zc_timer);
		zcp->zc_data.zc_has_timer = 0;
		serialize = B_TRUE;
	}
	if (zcp->zc_data.zc_has_serd_timer) {
		fmd_timer_remove(hdl, zcp->zc_serd_timer);
		zcp->zc_data.zc_has_serd_timer = 0;
		serialize = B_TRUE;
	}
	if (serialize)
		zfs_case_serialize(hdl, zcp);

	nvlist_free(detector);
}

/*
 * Main fmd entry point.
 */
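/*
 * The flow below: classify the event as a resource event or an ereport,
 * locate an existing case by ENA / pool / vdev (or open a new one), handle
 * resource events by closing the case or resetting SERD state, and
 * otherwise either solve the case directly or feed the ereport to the
 * appropriate SERD engine.
 */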
|
/*ARGSUSED*/
static void
zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
{
	zfs_case_t *zcp;
	int32_t pool_state;
	uint64_t ena, pool_guid, vdev_guid;
	nvlist_t *detector;
	boolean_t isresource;
	const char *serd;

	isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");

	if (isresource) {
		/*
		 * For resources, we don't have a normal payload.
		 */
		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    &vdev_guid) != 0)
			pool_state = SPA_LOAD_OPEN;
		else
			pool_state = SPA_LOAD_NONE;
		detector = NULL;
	} else {
		(void) nvlist_lookup_nvlist(nvl,
		    FM_EREPORT_DETECTOR, &detector);
		(void) nvlist_lookup_int32(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
	}

	/*
	 * We also ignore all ereports generated during an import of a pool,
	 * since the only possible fault (.pool) would result in import failure,
	 * and hence no persistent fault.  Some day we may want to do something
	 * with these ereports, so we continue generating them internally.
	 */
	if (pool_state == SPA_LOAD_IMPORT)
		return;

	/*
	 * Determine if this ereport corresponds to an open case.  Cases are
	 * indexed by ENA, since ZFS does all the work of chaining together
	 * related ereports.
	 *
	 * We also detect if an ereport corresponds to an open case by context,
	 * such as:
	 *
	 *	- An error occurred during an open of a pool with an existing
	 *	  case.
	 *
	 *	- An error occurred for a device which already has an open
	 *	  case.
	 */
	(void) nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
	if (nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
		vdev_guid = 0;
	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
2eff456ae020
6395695 uninitialized ENA can confuse ZFS diagnosis
eschrock
parents:
1544
diff
changeset
|
387 |
ena = 0; |
1544 | 388 |
|
389 |
for (zcp = uu_list_first(zfs_cases); zcp != NULL; |
|
390 |
zcp = uu_list_next(zfs_cases, zcp)) { |
|
391 |
/* |
|
392 |
* Matches a known ENA. |
|
393 |
*/ |
|
394 |
if (zcp->zc_data.zc_ena == ena) |
|
395 |
break; |
|
396 |
||
397 |
/* |
|
398 |
* Matches a case involving load errors for this same pool. |
|
399 |
*/ |
|
400 |
if (zcp->zc_data.zc_pool_guid == pool_guid && |
|
401 |
zcp->zc_data.zc_pool_state == SPA_LOAD_OPEN && |
|
402 |
pool_state == SPA_LOAD_OPEN) |
|
403 |
break; |
|
404 |
||
405 |
/* |
|
406 |
* Device errors for the same device. |
|
407 |
*/ |
|
408 |
if (vdev_guid != 0 && zcp->zc_data.zc_vdev_guid == vdev_guid) |
|
409 |
break; |
|
410 |
} |
|
411 |
||
412 |
if (zcp == NULL) { |
|
413 |
fmd_case_t *cs; |
|
4451 | 414 |
zfs_case_data_t data = { 0 }; |
1544 | 415 |
|
416 |
/* |
|
417 |
* If this is one of our 'fake' resource ereports, and there is |
|
418 |
* no case open, simply discard it. |
|
419 |
*/ |
|
420 |
if (isresource) |
|
421 |
return; |
|
422 |
||
423 |
/* |
|
424 |
* Open a new case. |
|
425 |
*/ |
|
426 |
cs = fmd_case_open(hdl, NULL); |
|
427 |
||
428 |
/* |
|
429 |
* Initialize the case buffer. To commonize code, we actually |
|
430 |
* create the buffer with existing data, and then call |
|
431 |
* zfs_case_unserialize() to instantiate the in-core structure. |
|
432 |
*/ |
|
433 |
fmd_buf_create(hdl, cs, CASE_DATA, |
|
434 |
sizeof (zfs_case_data_t)); |
|
435 |
||
4451 | 436 |
data.zc_version = CASE_DATA_VERSION_SERD; |
1544 | 437 |
data.zc_ena = ena; |
438 |
data.zc_pool_guid = pool_guid; |
|
439 |
data.zc_vdev_guid = vdev_guid; |
|
440 |
data.zc_pool_state = (int)pool_state; |
|
441 |
||
442 |
fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data)); |
|
443 |
||
444 |
zcp = zfs_case_unserialize(hdl, cs); |
|
445 |
assert(zcp != NULL); |
|
446 |
} |
|
447 |
||
448 |
if (isresource) { |
|
4451 | 449 |
if (fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.ok")) { |
450 |
/* |
|
451 |
* The 'resource.fs.zfs.ok' event is a special |
|
452 |
* internal-only event that signifies that a pool or |
|
453 |
* device that was previously faulted has now come |
|
454 |
* online (as detected by ZFS). This allows us to close |
|
455 |
* the associated case. |
|
456 |
*/ |
|
457 |
fmd_case_close(hdl, zcp->zc_case); |
|
458 |
} else if (fmd_nvl_class_match(hdl, nvl, |
|
459 |
"resource.fs.zfs.autoreplace")) { |
|
460 |
/* |
|
461 |
* The 'resource.fs.zfs.autoreplace' event indicates |
|
462 |
* that the pool was loaded with the 'autoreplace' |
|
463 |
* property set. In this case, any pending device |
|
464 |
* failures should be ignored, as the asynchronous |
|
465 |
* autoreplace handling will take care of them. |
|
466 |
*/ |
|
467 |
fmd_case_close(hdl, zcp->zc_case); |
|
468 |
} else { |
|
469 |
/* |
|
470 |
* The 'resource.fs.zfs.removed' event indicates that |
|
471 |
* device removal was detected, and the device was |
|
472 |
* closed asynchronously. If this is the case, we |
|
473 |
* assume that any recent I/O errors were due to the |
|
474 |
* device removal, not any fault of the device itself. |
|
475 |
* We reset the SERD engine, and cancel any pending |
|
476 |
* timers. |
|
477 |
*/ |
|
478 |
if (zcp->zc_data.zc_has_serd_timer) { |
|
479 |
fmd_timer_remove(hdl, zcp->zc_serd_timer); |
|
480 |
zcp->zc_data.zc_has_serd_timer = 0; |
|
481 |
zfs_case_serialize(hdl, zcp); |
|
482 |
} |
|
483 |
if (zcp->zc_data.zc_serd_io[0] != '\0') |
|
484 |
fmd_serd_reset(hdl, |
|
485 |
zcp->zc_data.zc_serd_io); |
|
486 |
if (zcp->zc_data.zc_serd_checksum[0] != '\0') |
|
487 |
fmd_serd_reset(hdl, |
|
488 |
zcp->zc_data.zc_serd_checksum); |
|
489 |
} |
|
1544 | 490 |
return; |
491 |
} |
|
492 |
||
493 |
/* |
|
494 |
* Associate the ereport with this case. |
|
495 |
*/ |
|
496 |
fmd_case_add_ereport(hdl, zcp->zc_case, ep); |
|
497 |
||
498 |
/* |
|
499 |
* Don't do anything else if this case is already solved. |
|
500 |
*/ |
|
501 |
if (fmd_case_solved(hdl, zcp->zc_case)) |
|
502 |
return; |
|
503 |
||

	/*
	 * Determine if we should solve the case and generate a fault.  We
	 * solve a case if:
	 *
	 *	a. A pool failed to open (ereport.fs.zfs.zpool)
	 *	b. A device failed to open (ereport.fs.zfs.vdev.*) while a
	 *	   pool was up and running.
	 *
	 * We may see a series of ereports associated with a pool open, all
	 * chained together by the same ENA.  If the pool open succeeds, then
	 * we'll see no further ereports.  To detect when a pool open has
	 * succeeded, we associate a timer with the event.  When it expires, we
	 * close the case.
	 */
|
	if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.zpool")) {
		/*
		 * Pool level fault.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE);
	} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*") &&
	    pool_state == SPA_LOAD_NONE) {
		/*
		 * Device fault.  Before solving the case, determine if the
		 * device failed during open, and the 'autoreplace' property is
		 * set.  If this is the case, then we post a sysevent which is
		 * picked up by the syseventd module, and any processing is
		 * done as needed.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.device", B_TRUE);
	} else {
		if (pool_state == SPA_LOAD_OPEN) {
			/*
			 * Error incurred during a pool open.  Reset the timer
			 * associated with this case.
			 */
			if (zcp->zc_data.zc_has_timer)
				fmd_timer_remove(hdl, zcp->zc_timer);
			zcp->zc_timer = fmd_timer_install(hdl, zcp, NULL,
			    zfs_case_timeout);
			if (!zcp->zc_data.zc_has_timer) {
				zcp->zc_data.zc_has_timer = 1;
				zfs_case_serialize(hdl, zcp);
			}
		}

		/*
		 * If this is a checksum or I/O error, then toss it into the
		 * appropriate SERD engine and check to see if it has fired.
		 * Ideally, we want to do something more sophisticated
		 * (persistent errors for a single data block, etc.).  For now,
		 * a single SERD engine is sufficient.
		 */
		serd = NULL;
		if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.io")) {
			if (zcp->zc_data.zc_serd_io[0] == '\0') {
				zfs_serd_name(zcp->zc_data.zc_serd_io,
				    pool_guid, vdev_guid, "io");
				fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
				    fmd_prop_get_int32(hdl, "io_N"),
				    fmd_prop_get_int64(hdl, "io_T"));
				zfs_case_serialize(hdl, zcp);
			}
			serd = zcp->zc_data.zc_serd_io;
		} else if (fmd_nvl_class_match(hdl, nvl,
		    "ereport.fs.zfs.checksum")) {
			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
				    pool_guid, vdev_guid, "checksum");
				fmd_serd_create(hdl,
				    zcp->zc_data.zc_serd_checksum,
				    fmd_prop_get_int32(hdl, "checksum_N"),
				    fmd_prop_get_int64(hdl, "checksum_T"));
				zfs_case_serialize(hdl, zcp);
			}
			serd = zcp->zc_data.zc_serd_checksum;
		}

		/*
		 * Because I/O errors may be due to device removal, we postpone
		 * any diagnosis until we're sure that we aren't about to
		 * receive a 'resource.fs.zfs.removed' event.
		 */
		if (serd && fmd_serd_record(hdl, serd, ep)) {
			if (zcp->zc_data.zc_has_serd_timer)
				fmd_timer_remove(hdl, zcp->zc_serd_timer);
			zcp->zc_serd_timer = fmd_timer_install(hdl, zcp, NULL,
			    zfs_serd_timeout);
			if (!zcp->zc_data.zc_has_serd_timer) {
				zcp->zc_data.zc_has_serd_timer = 1;
				zfs_case_serialize(hdl, zcp);
			}
		}
	}
}

/*
 * Timeout indicates one of two scenarios:
 *
 *	- The pool had faults but was eventually opened successfully.
 *
 *	- We diagnosed an I/O error, and it was not due to device removal
 *	  (which would cause the timeout to be cancelled).
 */
/* ARGSUSED */
static void
zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
{
	zfs_case_t *zcp = data;
	const char *faultname;

	if (id == zcp->zc_timer) {
		zcp->zc_data.zc_has_timer = 0;
		fmd_case_close(hdl, zcp->zc_case);
	}

	if (id == zcp->zc_serd_timer) {
		if (zcp->zc_data.zc_serd_io[0] != '\0' &&
		    fmd_serd_fired(hdl, zcp->zc_data.zc_serd_io)) {
			faultname = "fault.fs.zfs.vdev.io";
		} else {
			assert(fmd_serd_fired(hdl,
			    zcp->zc_data.zc_serd_checksum));
			faultname = "fault.fs.zfs.vdev.checksum";
		}
		zfs_case_solve(hdl, zcp, faultname, B_FALSE);
	}
}
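
/*
 * Case-close entry point: tear down any SERD engines and timers associated
 * with the case, then unlink and free the in-core state.
 */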
static void
zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
{
	zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);

	if (zcp->zc_data.zc_serd_checksum[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
	if (zcp->zc_data.zc_serd_io[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
	if (zcp->zc_data.zc_has_timer)
		fmd_timer_remove(hdl, zcp->zc_timer);
	if (zcp->zc_data.zc_has_serd_timer)
		fmd_timer_remove(hdl, zcp->zc_serd_timer);
	uu_list_remove(zfs_cases, zcp);
	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
}

/*
 * We use the fmd gc entry point to look for old cases that no longer apply.
 * This allows us to keep our set of case data small in a long running system.
 */
static void
zfs_fm_gc(fmd_hdl_t *hdl)
{
	zfs_purge_cases(hdl);
}

static const fmd_hdl_ops_t fmd_ops = {
	zfs_fm_recv,	/* fmdo_recv */
	zfs_fm_timeout,	/* fmdo_timeout */
	zfs_fm_close,	/* fmdo_close */
	NULL,		/* fmdo_stats */
	zfs_fm_gc,	/* fmdo_gc */
};
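
/*
 * Tunables: case_timeout bounds how long we wait for further pool-open
 * ereports before closing a case; io_N/io_T and checksum_N/checksum_T are
 * the SERD thresholds (N events within time T fires the engine); and
 * serd_timeout is the delay between a SERD engine firing and the fault
 * being solved, giving a pending 'resource.fs.zfs.removed' event a chance
 * to cancel the diagnosis.
 */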
static const fmd_prop_t fmd_props[] = {
	{ "case_timeout", FMD_TYPE_TIME, "5sec" },
	{ "checksum_N", FMD_TYPE_UINT32, "10" },
	{ "checksum_T", FMD_TYPE_TIME, "10min" },
	{ "io_N", FMD_TYPE_UINT32, "10" },
	{ "io_T", FMD_TYPE_TIME, "10min" },
	{ "serd_timeout", FMD_TYPE_TIME, "5sec" },
	{ NULL, 0, NULL }
};

static const fmd_hdl_info_t fmd_info = {
	"ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
};
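
/*
 * Module load: open a libzfs handle, create the case list, register with
 * fmd, unserialize any cases checkpointed by a previous incarnation, and
 * purge cases whose pool or vdev has since disappeared.
 */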
void
_fmd_init(fmd_hdl_t *hdl)
{
	fmd_case_t *cp;
	libzfs_handle_t *zhdl;

	if ((zhdl = libzfs_init()) == NULL)
		return;

	if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
	    sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
	    NULL, 0)) == NULL) {
		libzfs_fini(zhdl);
		return;
	}

	if ((zfs_cases = uu_list_create(zfs_case_pool, NULL, 0)) == NULL) {
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
		uu_list_destroy(zfs_cases);
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	fmd_hdl_setspecific(hdl, zhdl);

	/*
	 * Iterate over all active cases and unserialize the associated
	 * buffers, adding them to our list of open cases.
	 */
	for (cp = fmd_case_next(hdl, NULL);
	    cp != NULL; cp = fmd_case_next(hdl, cp))
		(void) zfs_case_unserialize(hdl, cp);

	/*
	 * Clear out any old cases that are no longer valid.
	 */
	zfs_purge_cases(hdl);

	zfs_case_timeout = fmd_prop_get_int64(hdl, "case_timeout");
	zfs_serd_timeout = fmd_prop_get_int64(hdl, "serd_timeout");
}
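
/*
 * Module unload: free all in-core case state and close the libzfs handle.
 * The persistent case buffers remain with fmd, to be unserialized again by
 * _fmd_init() on the next load.
 */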
void
_fmd_fini(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl;

	/*
	 * Remove all active cases.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		uu_list_remove(zfs_cases, zcp);
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
	}
	uu_list_walk_end(walk);

	uu_list_destroy(zfs_cases);
	uu_list_pool_destroy(zfs_case_pool);

	zhdl = fmd_hdl_getspecific(hdl);
	libzfs_fini(zhdl);
}