author | Lin Ling <Lin.Ling@Sun.COM> |
Wed, 24 Sep 2008 20:56:23 -0700 | |
changeset 7687 | 1132fbaf0c27 |
parent 7553 | 05e51cba6833 |
child 7754 | b80e4842ad54 |
permissions | -rw-r--r-- |
789 | 1 |
/* |
2 |
* CDDL HEADER START |
|
3 |
* |
|
4 |
* The contents of this file are subject to the terms of the |
|
1489
fa842259660e
6322205 Enable disk write cache if ZFS owns the disk
webaker
parents:
1171
diff
changeset
|
5 |
* Common Development and Distribution License (the "License"). |
fa842259660e
6322205 Enable disk write cache if ZFS owns the disk
webaker
parents:
1171
diff
changeset
|
6 |
* You may not use this file except in compliance with the License. |
789 | 7 |
* |
8 |
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
|
9 |
* or http://www.opensolaris.org/os/licensing. |
|
10 |
* See the License for the specific language governing permissions |
|
11 |
* and limitations under the License. |
|
12 |
* |
|
13 |
* When distributing Covered Code, include this CDDL HEADER in each |
|
14 |
* file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
|
15 |
* If applicable, add the following below this CDDL HEADER, with the |
|
16 |
* fields enclosed by brackets "[]" replaced with your own identifying |
|
17 |
* information: Portions Copyright [yyyy] [name of copyright owner] |
|
18 |
* |
|
19 |
* CDDL HEADER END |
|
20 |
*/ |
|
21 |
/* |
|
6423 | 22 |
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. |
789 | 23 |
* Use is subject to license terms. |
24 |
*/ |
|
25 |
||
26 |
#include <sys/zfs_context.h> |
|
27 |
#include <sys/spa.h> |
|
6423 | 28 |
#include <sys/refcount.h> |
789 | 29 |
#include <sys/vdev_disk.h> |
30 |
#include <sys/vdev_impl.h> |
|
31 |
#include <sys/fs/zfs.h> |
|
32 |
#include <sys/zio.h> |
|
1171 | 33 |
#include <sys/sunldi.h> |
6976
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
34 |
#include <sys/fm/fs/zfs.h> |
789 | 35 |
|
36 |
/* |
|
37 |
* Virtual device vector for disks. |
|
38 |
*/ |
|
39 |
||
40 |
extern ldi_ident_t zfs_li; |
|
41 |
||
42 |
typedef struct vdev_disk_buf { |
|
43 |
buf_t vdb_buf; |
|
44 |
zio_t *vdb_io; |
|
45 |
} vdev_disk_buf_t; |
|
46 |
||
47 |
static int |
|
5329 | 48 |
vdev_disk_open_common(vdev_t *vd) |
789 | 49 |
{ |
50 |
vdev_disk_t *dvd; |
|
5329 | 51 |
dev_t dev; |
789 | 52 |
int error; |
53 |
||
54 |
/* |
|
55 |
* We must have a pathname, and it must be absolute. |
|
56 |
*/ |
|
57 |
if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { |
|
58 |
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; |
|
59 |
return (EINVAL); |
|
60 |
} |
|
61 |
||
62 |
dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); |
|
63 |
||
64 |
/* |
|
65 |
* When opening a disk device, we want to preserve the user's original |
|
66 |
* intent. We always want to open the device by the path the user gave |
|
67 |
* us, even if it is one of multiple paths to the save device. But we |
|
68 |
* also want to be able to survive disks being removed/recabled. |
|
69 |
* Therefore the sequence of opening devices is: |
|
70 |
* |
|
1171 | 71 |
* 1. Try opening the device by path. For legacy pools without the |
72 |
* 'whole_disk' property, attempt to fix the path by appending 's0'. |
|
789 | 73 |
* |
74 |
* 2. If the devid of the device matches the stored value, return |
|
75 |
* success. |
|
76 |
* |
|
77 |
* 3. Otherwise, the device may have moved. Try opening the device |
|
78 |
* by the devid instead. |
|
79 |
* |
|
6673
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
80 |
* If the vdev is part of the root pool, we avoid opening it by path. |
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
81 |
* We do this because there is no /dev path available early in boot, |
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
82 |
* and if we try to open the device by path at a later point, we can |
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
83 |
* deadlock when devfsadm attempts to open the underlying backing store |
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
84 |
* file. |
789 | 85 |
*/ |
86 |
if (vd->vdev_devid != NULL) { |
|
87 |
if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, |
|
88 |
&dvd->vd_minor) != 0) { |
|
89 |
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; |
|
90 |
return (EINVAL); |
|
91 |
} |
|
92 |
} |
|
93 |
||
94 |
error = EINVAL; /* presume failure */ |
|
95 |
||
6673
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
96 |
if (vd->vdev_path != NULL && !spa_is_root(vd->vdev_spa)) { |
789 | 97 |
ddi_devid_t devid; |
98 |
||
1171 | 99 |
if (vd->vdev_wholedisk == -1ULL) { |
100 |
size_t len = strlen(vd->vdev_path) + 3; |
|
101 |
char *buf = kmem_alloc(len, KM_SLEEP); |
|
102 |
ldi_handle_t lh; |
|
103 |
||
104 |
(void) snprintf(buf, len, "%ss0", vd->vdev_path); |
|
789 | 105 |
|
1171 | 106 |
if (ldi_open_by_name(buf, spa_mode, kcred, |
107 |
&lh, zfs_li) == 0) { |
|
108 |
spa_strfree(vd->vdev_path); |
|
109 |
vd->vdev_path = buf; |
|
110 |
vd->vdev_wholedisk = 1ULL; |
|
111 |
(void) ldi_close(lh, spa_mode, kcred); |
|
112 |
} else { |
|
113 |
kmem_free(buf, len); |
|
114 |
} |
|
115 |
} |
|
789 | 116 |
|
1171 | 117 |
error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, |
118 |
&dvd->vd_lh, zfs_li); |
|
789 | 119 |
|
120 |
/* |
|
121 |
* Compare the devid to the stored value. |
|
122 |
*/ |
|
123 |
if (error == 0 && vd->vdev_devid != NULL && |
|
124 |
ldi_get_devid(dvd->vd_lh, &devid) == 0) { |
|
125 |
if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { |
|
126 |
error = EINVAL; |
|
127 |
(void) ldi_close(dvd->vd_lh, spa_mode, kcred); |
|
128 |
dvd->vd_lh = NULL; |
|
129 |
} |
|
130 |
ddi_devid_free(devid); |
|
131 |
} |
|
1171 | 132 |
|
133 |
/* |
|
134 |
* If we succeeded in opening the device, but 'vdev_wholedisk' |
|
135 |
* is not yet set, then this must be a slice. |
|
136 |
*/ |
|
137 |
if (error == 0 && vd->vdev_wholedisk == -1ULL) |
|
138 |
vd->vdev_wholedisk = 0; |
|
789 | 139 |
} |
140 |
||
141 |
/* |
|
142 |
* If we were unable to open by path, or the devid check fails, open by |
|
143 |
* devid instead. |
|
144 |
*/ |
|
145 |
if (error != 0 && vd->vdev_devid != NULL) |
|
146 |
error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, |
|
147 |
spa_mode, kcred, &dvd->vd_lh, zfs_li); |
|
148 |
||
4451 | 149 |
/* |
150 |
* If all else fails, then try opening by physical path (if available) |
|
151 |
* or the logical path (if we failed due to the devid check). While not |
|
152 |
* as reliable as the devid, this will give us something, and the higher |
|
153 |
* level vdev validation will prevent us from opening the wrong device. |
|
154 |
*/ |
|
155 |
if (error) { |
|
156 |
if (vd->vdev_physpath != NULL && |
|
157 |
(dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != ENODEV) |
|
158 |
error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode, |
|
159 |
kcred, &dvd->vd_lh, zfs_li); |
|
160 |
||
161 |
/* |
|
162 |
* Note that we don't support the legacy auto-wholedisk support |
|
163 |
* as above. This hasn't been used in a very long time and we |
|
164 |
* don't need to propagate its oddities to this edge condition. |
|
165 |
*/ |
|
6673
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
166 |
if (error && vd->vdev_path != NULL && |
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
167 |
!spa_is_root(vd->vdev_spa)) |
4451 | 168 |
error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, |
169 |
&dvd->vd_lh, zfs_li); |
|
170 |
} |
|
171 |
||
5329 | 172 |
if (error) |
789 | 173 |
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; |
5329 | 174 |
|
175 |
return (error); |
|
176 |
} |
|
177 |
||
178 |
static int |
|
179 |
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) |
|
180 |
{ |
|
181 |
vdev_disk_t *dvd; |
|
182 |
struct dk_minfo dkm; |
|
183 |
int error; |
|
184 |
dev_t dev; |
|
185 |
int otyp; |
|
186 |
||
187 |
error = vdev_disk_open_common(vd); |
|
188 |
if (error) |
|
789 | 189 |
return (error); |
190 |
||
5329 | 191 |
dvd = vd->vdev_tsd; |
789 | 192 |
/* |
4451 | 193 |
* Once a device is opened, verify that the physical device path (if |
194 |
* available) is up to date. |
|
195 |
*/ |
|
196 |
if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && |
|
197 |
ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { |
|
5329 | 198 |
char *physpath, *minorname; |
199 |
||
4451 | 200 |
physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); |
201 |
minorname = NULL; |
|
202 |
if (ddi_dev_pathname(dev, otyp, physpath) == 0 && |
|
203 |
ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && |
|
204 |
(vd->vdev_physpath == NULL || |
|
205 |
strcmp(vd->vdev_physpath, physpath) != 0)) { |
|
206 |
if (vd->vdev_physpath) |
|
207 |
spa_strfree(vd->vdev_physpath); |
|
208 |
(void) strlcat(physpath, ":", MAXPATHLEN); |
|
209 |
(void) strlcat(physpath, minorname, MAXPATHLEN); |
|
210 |
vd->vdev_physpath = spa_strdup(physpath); |
|
211 |
} |
|
212 |
if (minorname) |
|
213 |
kmem_free(minorname, strlen(minorname) + 1); |
|
214 |
kmem_free(physpath, MAXPATHLEN); |
|
215 |
} |
|
216 |
||
217 |
/* |
|
789 | 218 |
* Determine the actual size of the device. |
219 |
*/ |
|
220 |
if (ldi_get_size(dvd->vd_lh, psize) != 0) { |
|
221 |
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; |
|
222 |
return (EINVAL); |
|
223 |
} |
|
224 |
||
1732 | 225 |
/* |
226 |
* If we own the whole disk, try to enable disk write caching. |
|
227 |
* We ignore errors because it's OK if we can't do it. |
|
228 |
*/ |
|
1489
fa842259660e
6322205 Enable disk write cache if ZFS owns the disk
webaker
parents:
1171
diff
changeset
|
229 |
if (vd->vdev_wholedisk == 1) { |
1732 | 230 |
int wce = 1; |
231 |
(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, |
|
232 |
FKIOCTL, kcred, NULL); |
|
233 |
} |
|
1489
fa842259660e
6322205 Enable disk write cache if ZFS owns the disk
webaker
parents:
1171
diff
changeset
|
234 |
|
1732 | 235 |
/* |
236 |
* Determine the device's minimum transfer size. |
|
237 |
* If the ioctl isn't supported, assume DEV_BSIZE. |
|
238 |
*/ |
|
239 |
if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm, |
|
240 |
FKIOCTL, kcred, NULL) != 0) |
|
241 |
dkm.dki_lbsize = DEV_BSIZE; |
|
1489
fa842259660e
6322205 Enable disk write cache if ZFS owns the disk
webaker
parents:
1171
diff
changeset
|
242 |
|
1732 | 243 |
*ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1; |
1489
fa842259660e
6322205 Enable disk write cache if ZFS owns the disk
webaker
parents:
1171
diff
changeset
|
244 |
|
1773
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
245 |
/* |
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
246 |
* Clear the nowritecache bit, so that on a vdev_reopen() we will |
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
247 |
* try again. |
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
248 |
*/ |
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
249 |
vd->vdev_nowritecache = B_FALSE; |
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
250 |
|
789 | 251 |
return (0); |
252 |
} |
|
253 |
||
254 |
static void |
|
255 |
vdev_disk_close(vdev_t *vd) |
|
256 |
{ |
|
257 |
vdev_disk_t *dvd = vd->vdev_tsd; |
|
258 |
||
259 |
if (dvd == NULL) |
|
260 |
return; |
|
261 |
||
262 |
if (dvd->vd_minor != NULL) |
|
263 |
ddi_devid_str_free(dvd->vd_minor); |
|
264 |
||
265 |
if (dvd->vd_devid != NULL) |
|
266 |
ddi_devid_free(dvd->vd_devid); |
|
267 |
||
268 |
if (dvd->vd_lh != NULL) |
|
269 |
(void) ldi_close(dvd->vd_lh, spa_mode, kcred); |
|
270 |
||
271 |
kmem_free(dvd, sizeof (vdev_disk_t)); |
|
272 |
vd->vdev_tsd = NULL; |
|
273 |
} |
|
274 |
||
6423 | 275 |
int |
276 |
vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size, |
|
277 |
uint64_t offset, int flags) |
|
278 |
{ |
|
279 |
buf_t *bp; |
|
280 |
int error = 0; |
|
281 |
||
282 |
if (vd_lh == NULL) |
|
283 |
return (EINVAL); |
|
284 |
||
285 |
ASSERT(flags & B_READ || flags & B_WRITE); |
|
286 |
||
287 |
bp = getrbuf(KM_SLEEP); |
|
288 |
bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; |
|
289 |
bp->b_bcount = size; |
|
290 |
bp->b_un.b_addr = (void *)data; |
|
291 |
bp->b_lblkno = lbtodb(offset); |
|
292 |
bp->b_bufsize = size; |
|
293 |
||
294 |
error = ldi_strategy(vd_lh, bp); |
|
295 |
ASSERT(error == 0); |
|
296 |
if ((error = biowait(bp)) == 0 && bp->b_resid != 0) |
|
297 |
error = EIO; |
|
298 |
freerbuf(bp); |
|
299 |
||
300 |
return (error); |
|
301 |
} |
|
302 |
||
5329 | 303 |
static int |
304 |
vdev_disk_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset, |
|
305 |
int flags) |
|
306 |
{ |
|
307 |
int error = 0; |
|
6523
c1d2a7f04573
6616739 panic message ZFS: I/O failure (write on <unknown> is not very helpful
ek110237
parents:
6423
diff
changeset
|
308 |
vdev_disk_t *dvd = vd ? vd->vdev_tsd : NULL; |
5329 | 309 |
|
310 |
if (vd == NULL || dvd == NULL || dvd->vd_lh == NULL) |
|
311 |
return (EINVAL); |
|
312 |
||
6423 | 313 |
error = vdev_disk_physio(dvd->vd_lh, data, size, offset, flags); |
5329 | 314 |
|
315 |
if (zio_injection_enabled && error == 0) |
|
316 |
error = zio_handle_device_injection(vd, EIO); |
|
317 |
||
318 |
return (error); |
|
319 |
} |
|
320 |
||
5369
27c1235ef9a4
6621355 panic in vdev_disk_io_start when trying to write to a faulted device
gw25295
parents:
5329
diff
changeset
|
321 |
/* |
27c1235ef9a4
6621355 panic in vdev_disk_io_start when trying to write to a faulted device
gw25295
parents:
5329
diff
changeset
|
322 |
* Determine if the underlying device is accessible by reading and writing |
27c1235ef9a4
6621355 panic in vdev_disk_io_start when trying to write to a faulted device
gw25295
parents:
5329
diff
changeset
|
323 |
* to a known location. We must be able to do this during syncing context |
27c1235ef9a4
6621355 panic in vdev_disk_io_start when trying to write to a faulted device
gw25295
parents:
5329
diff
changeset
|
324 |
* and thus we cannot set the vdev state directly. |
27c1235ef9a4
6621355 panic in vdev_disk_io_start when trying to write to a faulted device
gw25295
parents:
5329
diff
changeset
|
325 |
*/ |
5329 | 326 |
static int |
327 |
vdev_disk_probe(vdev_t *vd) |
|
328 |
{ |
|
329 |
uint64_t offset; |
|
330 |
vdev_t *nvd; |
|
331 |
int l, error = 0, retries = 0; |
|
332 |
char *vl_pad; |
|
333 |
||
334 |
if (vd == NULL) |
|
335 |
return (EINVAL); |
|
336 |
||
337 |
/* Hijack the current vdev */ |
|
338 |
nvd = vd; |
|
339 |
||
340 |
/* |
|
341 |
* Pick a random label to rewrite. |
|
342 |
*/ |
|
343 |
l = spa_get_random(VDEV_LABELS); |
|
344 |
ASSERT(l < VDEV_LABELS); |
|
345 |
||
346 |
offset = vdev_label_offset(vd->vdev_psize, l, |
|
347 |
offsetof(vdev_label_t, vl_pad)); |
|
348 |
||
349 |
vl_pad = kmem_alloc(VDEV_SKIP_SIZE, KM_SLEEP); |
|
350 |
||
351 |
/* |
|
352 |
* Try to read and write to a special location on the |
|
353 |
* label. We use the existing vdev initially and only |
|
354 |
* try to create and reopen it if we encounter a failure. |
|
355 |
*/ |
|
356 |
while ((error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE, |
|
357 |
offset, B_READ)) != 0 && retries == 0) { |
|
358 |
||
359 |
nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); |
|
360 |
if (vd->vdev_path) |
|
361 |
nvd->vdev_path = spa_strdup(vd->vdev_path); |
|
362 |
if (vd->vdev_physpath) |
|
363 |
nvd->vdev_physpath = spa_strdup(vd->vdev_physpath); |
|
364 |
if (vd->vdev_devid) |
|
365 |
nvd->vdev_devid = spa_strdup(vd->vdev_devid); |
|
366 |
nvd->vdev_wholedisk = vd->vdev_wholedisk; |
|
367 |
nvd->vdev_guid = vd->vdev_guid; |
|
6673
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
368 |
nvd->vdev_spa = vd->vdev_spa; |
5329 | 369 |
retries++; |
370 |
||
371 |
error = vdev_disk_open_common(nvd); |
|
5369
27c1235ef9a4
6621355 panic in vdev_disk_io_start when trying to write to a faulted device
gw25295
parents:
5329
diff
changeset
|
372 |
if (error) |
5329 | 373 |
break; |
374 |
} |
|
375 |
||
376 |
if (!error) { |
|
377 |
error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE, |
|
378 |
offset, B_WRITE); |
|
379 |
} |
|
380 |
||
381 |
/* Clean up if we allocated a new vdev */ |
|
382 |
if (retries) { |
|
383 |
vdev_disk_close(nvd); |
|
384 |
if (nvd->vdev_path) |
|
385 |
spa_strfree(nvd->vdev_path); |
|
386 |
if (nvd->vdev_physpath) |
|
387 |
spa_strfree(nvd->vdev_physpath); |
|
388 |
if (nvd->vdev_devid) |
|
389 |
spa_strfree(nvd->vdev_devid); |
|
390 |
kmem_free(nvd, sizeof (vdev_t)); |
|
391 |
} |
|
392 |
kmem_free(vl_pad, VDEV_SKIP_SIZE); |
|
393 |
||
394 |
/* Reset the failing flag */ |
|
395 |
if (!error) |
|
396 |
vd->vdev_is_failing = B_FALSE; |
|
397 |
||
398 |
return (error); |
|
399 |
} |
|
400 |
||
789 | 401 |
static void |
402 |
vdev_disk_io_intr(buf_t *bp) |
|
403 |
{ |
|
404 |
vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp; |
|
405 |
zio_t *zio = vdb->vdb_io; |
|
406 |
||
6976
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
407 |
/* |
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
408 |
* The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. |
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
409 |
* Rather than teach the rest of the stack about other error |
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
410 |
* possibilities (EFAULT, etc), we normalize the error value here. |
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
411 |
*/ |
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
412 |
zio->io_error = (geterror(bp) != 0 ? EIO : 0); |
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
413 |
|
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
414 |
if (zio->io_error == 0 && bp->b_resid != 0) |
789 | 415 |
zio->io_error = EIO; |
416 |
||
417 |
kmem_free(vdb, sizeof (vdev_disk_buf_t)); |
|
418 |
||
5530 | 419 |
zio_interrupt(zio); |
789 | 420 |
} |
421 |
||
422 |
static void |
|
423 |
vdev_disk_ioctl_done(void *zio_arg, int error) |
|
424 |
{ |
|
425 |
zio_t *zio = zio_arg; |
|
426 |
||
427 |
zio->io_error = error; |
|
428 |
||
5530 | 429 |
zio_interrupt(zio); |
789 | 430 |
} |
431 |
||
5530 | 432 |
static int |
789 | 433 |
vdev_disk_io_start(zio_t *zio) |
434 |
{ |
|
435 |
vdev_t *vd = zio->io_vd; |
|
436 |
vdev_disk_t *dvd = vd->vdev_tsd; |
|
437 |
vdev_disk_buf_t *vdb; |
|
438 |
buf_t *bp; |
|
439 |
int flags, error; |
|
440 |
||
441 |
if (zio->io_type == ZIO_TYPE_IOCTL) { |
|
442 |
zio_vdev_io_bypass(zio); |
|
443 |
||
444 |
/* XXPOLICY */ |
|
5329 | 445 |
if (!vdev_readable(vd)) { |
789 | 446 |
zio->io_error = ENXIO; |
5530 | 447 |
return (ZIO_PIPELINE_CONTINUE); |
789 | 448 |
} |
449 |
||
450 |
switch (zio->io_cmd) { |
|
451 |
||
452 |
case DKIOCFLUSHWRITECACHE: |
|
453 |
||
2885 | 454 |
if (zfs_nocacheflush) |
455 |
break; |
|
456 |
||
1773
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
457 |
if (vd->vdev_nowritecache) { |
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
458 |
zio->io_error = ENOTSUP; |
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
459 |
break; |
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
460 |
} |
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
461 |
|
789 | 462 |
zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done; |
5065
fcf530c3356e
PSARC 2007/053 Per-Disk-Device support of non-volatile cache
gz161490
parents:
4455
diff
changeset
|
463 |
zio->io_dk_callback.dkc_flag = FLUSH_VOLATILE; |
789 | 464 |
zio->io_dk_callback.dkc_cookie = zio; |
465 |
||
466 |
error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, |
|
467 |
(uintptr_t)&zio->io_dk_callback, |
|
468 |
FKIOCTL, kcred, NULL); |
|
469 |
||
470 |
if (error == 0) { |
|
471 |
/* |
|
472 |
* The ioctl will be done asychronously, |
|
473 |
* and will call vdev_disk_ioctl_done() |
|
474 |
* upon completion. |
|
475 |
*/ |
|
5530 | 476 |
return (ZIO_PIPELINE_STOP); |
477 |
} |
|
478 |
||
479 |
if (error == ENOTSUP || error == ENOTTY) { |
|
1773
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
480 |
/* |
4455
122ee9c0f54c
6456223 system hang after test suite run on usb disk
mishra
parents:
4451
diff
changeset
|
481 |
* If we get ENOTSUP or ENOTTY, we know that |
122ee9c0f54c
6456223 system hang after test suite run on usb disk
mishra
parents:
4451
diff
changeset
|
482 |
* no future attempts will ever succeed. |
122ee9c0f54c
6456223 system hang after test suite run on usb disk
mishra
parents:
4451
diff
changeset
|
483 |
* In this case we set a persistent bit so |
122ee9c0f54c
6456223 system hang after test suite run on usb disk
mishra
parents:
4451
diff
changeset
|
484 |
* that we don't bother with the ioctl in the |
122ee9c0f54c
6456223 system hang after test suite run on usb disk
mishra
parents:
4451
diff
changeset
|
485 |
* future. |
1773
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
486 |
*/ |
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
487 |
vd->vdev_nowritecache = B_TRUE; |
789 | 488 |
} |
489 |
zio->io_error = error; |
|
1773
d6e4f2855c14
6407791 bringover into ZFS results in s. files newer than extracted source
eschrock
parents:
1732
diff
changeset
|
490 |
|
789 | 491 |
break; |
492 |
||
493 |
default: |
|
494 |
zio->io_error = ENOTSUP; |
|
495 |
} |
|
496 |
||
5530 | 497 |
return (ZIO_PIPELINE_CONTINUE); |
789 | 498 |
} |
499 |
||
500 |
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) |
|
5530 | 501 |
return (ZIO_PIPELINE_STOP); |
789 | 502 |
|
503 |
if ((zio = vdev_queue_io(zio)) == NULL) |
|
5530 | 504 |
return (ZIO_PIPELINE_STOP); |
505 |
||
506 |
if (zio->io_type == ZIO_TYPE_WRITE) |
|
507 |
error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; |
|
508 |
else |
|
509 |
error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; |
|
510 |
error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; |
|
511 |
||
512 |
if (error) { |
|
513 |
zio->io_error = error; |
|
514 |
zio_interrupt(zio); |
|
515 |
return (ZIO_PIPELINE_STOP); |
|
516 |
} |
|
789 | 517 |
|
518 |
flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); |
|
519 |
flags |= B_BUSY | B_NOCACHE; |
|
520 |
if (zio->io_flags & ZIO_FLAG_FAILFAST) |
|
521 |
flags |= B_FAILFAST; |
|
522 |
||
523 |
vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP); |
|
524 |
||
525 |
vdb->vdb_io = zio; |
|
526 |
bp = &vdb->vdb_buf; |
|
527 |
||
528 |
bioinit(bp); |
|
529 |
bp->b_flags = flags; |
|
530 |
bp->b_bcount = zio->io_size; |
|
531 |
bp->b_un.b_addr = zio->io_data; |
|
532 |
bp->b_lblkno = lbtodb(zio->io_offset); |
|
533 |
bp->b_bufsize = zio->io_size; |
|
534 |
bp->b_iodone = (int (*)())vdev_disk_io_intr; |
|
535 |
||
536 |
error = ldi_strategy(dvd->vd_lh, bp); |
|
537 |
/* ldi_strategy() will return non-zero only on programming errors */ |
|
538 |
ASSERT(error == 0); |
|
5530 | 539 |
|
540 |
return (ZIO_PIPELINE_STOP); |
|
789 | 541 |
} |
542 |
||
5530 | 543 |
static int |
789 | 544 |
vdev_disk_io_done(zio_t *zio) |
545 |
{ |
|
546 |
vdev_queue_io_done(zio); |
|
547 |
||
548 |
if (zio->io_type == ZIO_TYPE_WRITE) |
|
549 |
vdev_cache_write(zio); |
|
550 |
||
1544 | 551 |
if (zio_injection_enabled && zio->io_error == 0) |
552 |
zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); |
|
553 |
||
4451 | 554 |
/* |
555 |
* If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if |
|
556 |
* the device has been removed. If this is the case, then we trigger an |
|
5329 | 557 |
* asynchronous removal of the device. Otherwise, probe the device and |
5369
27c1235ef9a4
6621355 panic in vdev_disk_io_start when trying to write to a faulted device
gw25295
parents:
5329
diff
changeset
|
558 |
* make sure it's still accessible. |
4451 | 559 |
*/ |
560 |
if (zio->io_error == EIO) { |
|
5329 | 561 |
vdev_t *vd = zio->io_vd; |
562 |
vdev_disk_t *dvd = vd->vdev_tsd; |
|
563 |
int state; |
|
564 |
||
4451 | 565 |
state = DKIO_NONE; |
5329 | 566 |
if (dvd && ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, |
4451 | 567 |
FKIOCTL, kcred, NULL) == 0 && |
568 |
state != DKIO_INSERTED) { |
|
569 |
vd->vdev_remove_wanted = B_TRUE; |
|
570 |
spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); |
|
5329 | 571 |
} else if (vdev_probe(vd) != 0) { |
572 |
ASSERT(vd->vdev_ops->vdev_op_leaf); |
|
6976
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
573 |
if (!vd->vdev_is_failing) { |
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
574 |
vd->vdev_is_failing = B_TRUE; |
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
575 |
zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, |
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
576 |
vd->vdev_spa, vd, zio, 0, 0); |
cae5f06df471
PSARC 2008/388 Short circuit for vdev probe failure
eschrock
parents:
6673
diff
changeset
|
577 |
} |
4451 | 578 |
} |
579 |
} |
|
580 |
||
6615
333cfc13ec55
6616599 vdev_config_sync(rvd, txg) == 0, file: ../../common/fs/zfs/spa .c, line: 3537
gw25295
parents:
6523
diff
changeset
|
581 |
if (zio_injection_enabled && zio->io_error == 0) |
333cfc13ec55
6616599 vdev_config_sync(rvd, txg) == 0, file: ../../common/fs/zfs/spa .c, line: 3537
gw25295
parents:
6523
diff
changeset
|
582 |
zio->io_error = zio_handle_label_injection(zio, EIO); |
333cfc13ec55
6616599 vdev_config_sync(rvd, txg) == 0, file: ../../common/fs/zfs/spa .c, line: 3537
gw25295
parents:
6523
diff
changeset
|
583 |
|
5530 | 584 |
return (ZIO_PIPELINE_CONTINUE); |
789 | 585 |
} |
586 |
||
587 |
vdev_ops_t vdev_disk_ops = { |
|
588 |
vdev_disk_open, |
|
589 |
vdev_disk_close, |
|
5329 | 590 |
vdev_disk_probe, |
789 | 591 |
vdev_default_asize, |
592 |
vdev_disk_io_start, |
|
593 |
vdev_disk_io_done, |
|
594 |
NULL, |
|
595 |
VDEV_TYPE_DISK, /* name of this vdev type */ |
|
596 |
B_TRUE /* leaf vdev */ |
|
597 |
}; |
|
6423 | 598 |
|
599 |
/* |
|
7147
1e1d75c88283
6704717 ZFS mirrored root doesn't live up to expectations
taylor
parents:
6976
diff
changeset
|
600 |
* Given the root disk device devid or pathname, read the label from |
1e1d75c88283
6704717 ZFS mirrored root doesn't live up to expectations
taylor
parents:
6976
diff
changeset
|
601 |
* the device, and construct a configuration nvlist. |
6423 | 602 |
*/ |
7539
e3f4b4b9f982
6724326 better "can't mountroot" message
Lin Ling <Lin.Ling@Sun.COM>
parents:
7147
diff
changeset
|
603 |
int |
e3f4b4b9f982
6724326 better "can't mountroot" message
Lin Ling <Lin.Ling@Sun.COM>
parents:
7147
diff
changeset
|
604 |
vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) |
6423 | 605 |
{ |
606 |
ldi_handle_t vd_lh; |
|
607 |
vdev_label_t *label; |
|
608 |
uint64_t s, size; |
|
609 |
int l; |
|
7147
1e1d75c88283
6704717 ZFS mirrored root doesn't live up to expectations
taylor
parents:
6976
diff
changeset
|
610 |
ddi_devid_t tmpdevid; |
7687
1132fbaf0c27
6750475 can't boot off a mirrored root with a failed disk
Lin Ling <Lin.Ling@Sun.COM>
parents:
7553
diff
changeset
|
611 |
int error = -1; |
7147
1e1d75c88283
6704717 ZFS mirrored root doesn't live up to expectations
taylor
parents:
6976
diff
changeset
|
612 |
char *minor_name; |
6423 | 613 |
|
614 |
/* |
|
615 |
* Read the device label and build the nvlist. |
|
616 |
*/ |
|
7687
1132fbaf0c27
6750475 can't boot off a mirrored root with a failed disk
Lin Ling <Lin.Ling@Sun.COM>
parents:
7553
diff
changeset
|
617 |
if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, |
7147
1e1d75c88283
6704717 ZFS mirrored root doesn't live up to expectations
taylor
parents:
6976
diff
changeset
|
618 |
&minor_name) == 0) { |
1e1d75c88283
6704717 ZFS mirrored root doesn't live up to expectations
taylor
parents:
6976
diff
changeset
|
619 |
error = ldi_open_by_devid(tmpdevid, minor_name, |
1e1d75c88283
6704717 ZFS mirrored root doesn't live up to expectations
taylor
parents:
6976
diff
changeset
|
620 |
spa_mode, kcred, &vd_lh, zfs_li); |
1e1d75c88283
6704717 ZFS mirrored root doesn't live up to expectations
taylor
parents:
6976
diff
changeset
|
621 |
ddi_devid_free(tmpdevid); |
1e1d75c88283
6704717 ZFS mirrored root doesn't live up to expectations
taylor
parents:
6976
diff
changeset
|
622 |
ddi_devid_str_free(minor_name); |
1e1d75c88283
6704717 ZFS mirrored root doesn't live up to expectations
taylor
parents:
6976
diff
changeset
|
623 |
} |
1e1d75c88283
6704717 ZFS mirrored root doesn't live up to expectations
taylor
parents:
6976
diff
changeset
|
624 |
|
7687
1132fbaf0c27
6750475 can't boot off a mirrored root with a failed disk
Lin Ling <Lin.Ling@Sun.COM>
parents:
7553
diff
changeset
|
625 |
if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, |
1132fbaf0c27
6750475 can't boot off a mirrored root with a failed disk
Lin Ling <Lin.Ling@Sun.COM>
parents:
7553
diff
changeset
|
626 |
zfs_li))) |
7539
e3f4b4b9f982
6724326 better "can't mountroot" message
Lin Ling <Lin.Ling@Sun.COM>
parents:
7147
diff
changeset
|
627 |
return (error); |
6423 | 628 |
|
6673
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
629 |
if (ldi_get_size(vd_lh, &s)) { |
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
630 |
(void) ldi_close(vd_lh, FREAD, kcred); |
7539
e3f4b4b9f982
6724326 better "can't mountroot" message
Lin Ling <Lin.Ling@Sun.COM>
parents:
7147
diff
changeset
|
631 |
return (EIO); |
6673
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
632 |
} |
6423 | 633 |
|
634 |
size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); |
|
635 |
label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); |
|
636 |
||
637 |
for (l = 0; l < VDEV_LABELS; l++) { |
|
638 |
uint64_t offset, state, txg = 0; |
|
639 |
||
640 |
/* read vdev label */ |
|
641 |
offset = vdev_label_offset(size, l, 0); |
|
642 |
if (vdev_disk_physio(vd_lh, (caddr_t)label, |
|
643 |
VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE + |
|
644 |
VDEV_PHYS_SIZE, offset, B_READ) != 0) |
|
645 |
continue; |
|
646 |
||
647 |
if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, |
|
7539
e3f4b4b9f982
6724326 better "can't mountroot" message
Lin Ling <Lin.Ling@Sun.COM>
parents:
7147
diff
changeset
|
648 |
sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { |
e3f4b4b9f982
6724326 better "can't mountroot" message
Lin Ling <Lin.Ling@Sun.COM>
parents:
7147
diff
changeset
|
649 |
*config = NULL; |
6423 | 650 |
continue; |
651 |
} |
|
652 |
||
7539
e3f4b4b9f982
6724326 better "can't mountroot" message
Lin Ling <Lin.Ling@Sun.COM>
parents:
7147
diff
changeset
|
653 |
if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, |
6423 | 654 |
&state) != 0 || state >= POOL_STATE_DESTROYED) { |
7539
e3f4b4b9f982
6724326 better "can't mountroot" message
Lin Ling <Lin.Ling@Sun.COM>
parents:
7147
diff
changeset
|
655 |
nvlist_free(*config); |
e3f4b4b9f982
6724326 better "can't mountroot" message
Lin Ling <Lin.Ling@Sun.COM>
parents:
7147
diff
changeset
|
656 |
*config = NULL; |
6423 | 657 |
continue; |
658 |
} |
|
659 |
||
7539
e3f4b4b9f982
6724326 better "can't mountroot" message
Lin Ling <Lin.Ling@Sun.COM>
parents:
7147
diff
changeset
|
660 |
if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, |
6423 | 661 |
&txg) != 0 || txg == 0) { |
7539
e3f4b4b9f982
6724326 better "can't mountroot" message
Lin Ling <Lin.Ling@Sun.COM>
parents:
7147
diff
changeset
|
662 |
nvlist_free(*config); |
e3f4b4b9f982
6724326 better "can't mountroot" message
Lin Ling <Lin.Ling@Sun.COM>
parents:
7147
diff
changeset
|
663 |
*config = NULL; |
6423 | 664 |
continue; |
665 |
} |
|
666 |
||
667 |
break; |
|
668 |
} |
|
669 |
||
670 |
kmem_free(label, sizeof (vdev_label_t)); |
|
6673
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
671 |
(void) ldi_close(vd_lh, FREAD, kcred); |
be079d6124af
6697301 deadlock between ZFS and devfs can hang system
eschrock
parents:
6615
diff
changeset
|
672 |
|
7539
e3f4b4b9f982
6724326 better "can't mountroot" message
Lin Ling <Lin.Ling@Sun.COM>
parents:
7147
diff
changeset
|
673 |
return (error); |
6423 | 674 |
} |