789
|
1 |
/*
|
|
2 |
* CDDL HEADER START
|
|
3 |
*
|
|
4 |
* The contents of this file are subject to the terms of the
|
|
5 |
* Common Development and Distribution License, Version 1.0 only
|
|
6 |
* (the "License"). You may not use this file except in compliance
|
|
7 |
* with the License.
|
|
8 |
*
|
|
9 |
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
10 |
* or http://www.opensolaris.org/os/licensing.
|
|
11 |
* See the License for the specific language governing permissions
|
|
12 |
* and limitations under the License.
|
|
13 |
*
|
|
14 |
* When distributing Covered Code, include this CDDL HEADER in each
|
|
15 |
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
16 |
* If applicable, add the following below this CDDL HEADER, with the
|
|
17 |
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
18 |
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
19 |
*
|
|
20 |
* CDDL HEADER END
|
|
21 |
*/
|
|
22 |
/*
|
|
23 |
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
|
|
24 |
* Use is subject to license terms.
|
|
25 |
*/
|
|
26 |
|
|
27 |
#pragma ident "%Z%%M% %I% %E% SMI"
|
|
28 |
|
|
29 |
#include <sys/zfs_context.h>
|
|
30 |
#include <sys/dmu_objset.h>
|
|
31 |
#include <sys/dmu_traverse.h>
|
|
32 |
#include <sys/dsl_dataset.h>
|
|
33 |
#include <sys/dsl_dir.h>
|
|
34 |
#include <sys/dsl_pool.h>
|
|
35 |
#include <sys/dnode.h>
|
|
36 |
#include <sys/spa.h>
|
|
37 |
#include <sys/zio.h>
|
|
38 |
#include <sys/dmu_impl.h>
|
|
39 |
|
|
40 |
/*
 * Shift count used to scale a block id at indirection 'level' to a
 * level-0 block offset: blkoff = blkid << BP_SPAN_SHIFT(level, width).
 * Evaluates to (level) * (width).
 */
#define BP_SPAN_SHIFT(level, width) ((level) * (width))
|
|
41 |
|
|
42 |
/*
 * True when two block pointers have equal identity DVAs (as compared by
 * DVA_EQUAL on BP_IDENTITY) and the same birth txg.  Both arguments are
 * pointers and are each evaluated twice.
 */
#define	BP_EQUAL(b1, b2)					\
	(DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) &&		\
	(b1)->blk_birth == (b2)->blk_birth)
|
|
45 |
|
|
46 |
/*
|
|
47 |
* Compare two bookmarks.
|
|
48 |
*
|
|
49 |
* For ADVANCE_PRE, the visitation order is:
|
|
50 |
*
|
|
51 |
* objset 0, 1, 2, ..., ZB_MAXOBJSET.
|
|
52 |
* object 0, 1, 2, ..., ZB_MAXOBJECT.
|
|
53 |
* blkoff 0, 1, 2, ...
|
|
54 |
* level ZB_MAXLEVEL, ..., 2, 1, 0.
|
|
55 |
*
|
|
56 |
* where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
|
|
57 |
* ordering vector is:
|
|
58 |
*
|
|
59 |
* < objset, object, blkoff, -level >
|
|
60 |
*
|
|
61 |
* For ADVANCE_POST, the starting offsets aren't sequential but ending
|
|
62 |
* offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
|
|
63 |
* The visitation order is:
|
|
64 |
*
|
|
65 |
* objset 1, 2, ..., ZB_MAXOBJSET, 0.
|
|
66 |
* object 1, 2, ..., ZB_MAXOBJECT, 0.
|
|
67 |
* blkoff 1, 2, ...
|
|
68 |
* level 0, 1, 2, ..., ZB_MAXLEVEL.
|
|
69 |
*
|
|
70 |
* and thus a valid ordering vector is:
|
|
71 |
*
|
|
72 |
* < objset - 1, object - 1, blkoff, level >
|
|
73 |
*
|
|
74 |
* Both orderings can be expressed as:
|
|
75 |
*
|
|
76 |
* < objset + bias, object + bias, blkoff, level ^ bias >
|
|
77 |
*
|
|
78 |
* where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
|
|
79 |
* and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
|
|
80 |
*
|
|
81 |
* Special case: an objset's osphys is represented as level -1 of object 0.
|
|
82 |
* It is always either the very first or very last block we visit in an objset.
|
|
83 |
* Therefore, if either bookmark's level is -1, level alone determines order.
|
|
84 |
*/
|
|
85 |
static int
|
|
86 |
compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
|
|
87 |
int advance)
|
|
88 |
{
|
|
89 |
int bias = (advance & ADVANCE_PRE) ? 0 : -1;
|
|
90 |
uint64_t sblkoff, eblkoff;
|
|
91 |
int slevel, elevel, wshift;
|
|
92 |
|
|
93 |
if (szb->zb_objset + bias < ezb->zb_objset + bias)
|
|
94 |
return (-1);
|
|
95 |
|
|
96 |
if (szb->zb_objset + bias > ezb->zb_objset + bias)
|
|
97 |
return (1);
|
|
98 |
|
|
99 |
slevel = szb->zb_level;
|
|
100 |
elevel = ezb->zb_level;
|
|
101 |
|
|
102 |
if ((slevel | elevel) < 0)
|
|
103 |
return ((slevel ^ bias) - (elevel ^ bias));
|
|
104 |
|
|
105 |
if (szb->zb_object + bias < ezb->zb_object + bias)
|
|
106 |
return (-1);
|
|
107 |
|
|
108 |
if (szb->zb_object + bias > ezb->zb_object + bias)
|
|
109 |
return (1);
|
|
110 |
|
|
111 |
if (dnp == NULL)
|
|
112 |
return (0);
|
|
113 |
|
|
114 |
wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
115 |
|
|
116 |
sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
|
|
117 |
eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
|
|
118 |
|
|
119 |
if (sblkoff < eblkoff)
|
|
120 |
return (-1);
|
|
121 |
|
|
122 |
if (sblkoff > eblkoff)
|
|
123 |
return (1);
|
|
124 |
|
|
125 |
return ((elevel ^ bias) - (slevel ^ bias));
|
|
126 |
}
|
|
127 |
|
|
128 |
/*
 * Assign all four fields of the bookmark pointed to by 'zb'.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a single
 * statement and can be used safely in an unbraced if/else; every argument
 * is parenthesized so that arbitrary expressions may be passed.  'zb' is
 * evaluated four times, so it must be free of side effects.
 */
#define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
	do {						\
		(zb)->zb_objset = (objset);		\
		(zb)->zb_object = (object);		\
		(zb)->zb_level = (level);		\
		(zb)->zb_blkid = (blkid);		\
	} while (0)
|
|
135 |
|
|
136 |
/*
 * Assign only the level and blkid fields of the bookmark pointed to by
 * 'zb', leaving objset and object untouched.
 *
 * Wrapped in do { } while (0) so it behaves as a single statement in an
 * unbraced if/else; arguments are parenthesized.  'zb' is evaluated twice.
 */
#define	SET_BOOKMARK_LB(zb, level, blkid)		\
	do {						\
		(zb)->zb_level = (level);		\
		(zb)->zb_blkid = (blkid);		\
	} while (0)
|
|
141 |
|
|
142 |
static int
|
|
143 |
advance_objset(zseg_t *zseg, uint64_t objset, int advance)
|
|
144 |
{
|
|
145 |
zbookmark_t *zb = &zseg->seg_start;
|
|
146 |
|
|
147 |
if (advance & ADVANCE_PRE) {
|
|
148 |
if (objset >= ZB_MAXOBJSET)
|
|
149 |
return (ERANGE);
|
|
150 |
SET_BOOKMARK(zb, objset, 0, -1, 0);
|
|
151 |
} else {
|
|
152 |
if (objset >= ZB_MAXOBJSET)
|
|
153 |
objset = 0;
|
|
154 |
SET_BOOKMARK(zb, objset, 1, 0, 0);
|
|
155 |
}
|
|
156 |
|
|
157 |
if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
|
|
158 |
return (ERANGE);
|
|
159 |
|
|
160 |
return (EAGAIN);
|
|
161 |
}
|
|
162 |
|
|
163 |
static int
|
|
164 |
advance_object(zseg_t *zseg, uint64_t object, int advance)
|
|
165 |
{
|
|
166 |
zbookmark_t *zb = &zseg->seg_start;
|
|
167 |
|
|
168 |
if (advance & ADVANCE_PRE) {
|
|
169 |
if (object >= ZB_MAXOBJECT) {
|
|
170 |
SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
|
|
171 |
} else {
|
|
172 |
SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
|
|
173 |
}
|
|
174 |
} else {
|
|
175 |
if (zb->zb_object == 0) {
|
|
176 |
SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
|
|
177 |
} else {
|
|
178 |
if (object >= ZB_MAXOBJECT)
|
|
179 |
object = 0;
|
|
180 |
SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
|
|
181 |
}
|
|
182 |
}
|
|
183 |
|
|
184 |
if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
|
|
185 |
return (ERANGE);
|
|
186 |
|
|
187 |
return (EAGAIN);
|
|
188 |
}
|
|
189 |
|
|
190 |
static int
|
|
191 |
advance_from_osphys(zseg_t *zseg, int advance)
|
|
192 |
{
|
|
193 |
zbookmark_t *zb = &zseg->seg_start;
|
|
194 |
|
|
195 |
ASSERT(zb->zb_object == 0);
|
|
196 |
ASSERT(zb->zb_level == -1);
|
|
197 |
ASSERT(zb->zb_blkid == 0);
|
|
198 |
|
|
199 |
if (advance & ADVANCE_PRE) {
|
|
200 |
SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
|
|
201 |
} else {
|
|
202 |
if (zb->zb_objset == 0)
|
|
203 |
return (ERANGE);
|
|
204 |
SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
|
|
205 |
}
|
|
206 |
|
|
207 |
if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
|
|
208 |
return (ERANGE);
|
|
209 |
|
|
210 |
return (EAGAIN);
|
|
211 |
}
|
|
212 |
|
|
213 |
static int
|
|
214 |
advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
|
|
215 |
{
|
|
216 |
zbookmark_t *zb = &zseg->seg_start;
|
|
217 |
int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
218 |
int maxlevel = dnp->dn_nlevels - 1;
|
|
219 |
int level = zb->zb_level;
|
|
220 |
uint64_t blkid = zb->zb_blkid;
|
|
221 |
|
|
222 |
if (advance & ADVANCE_PRE) {
|
|
223 |
if (level > 0 && rc == 0) {
|
|
224 |
level--;
|
|
225 |
blkid <<= wshift;
|
|
226 |
} else {
|
|
227 |
blkid++;
|
|
228 |
|
|
229 |
if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
|
|
230 |
dnp->dn_maxblkid)
|
|
231 |
return (ERANGE);
|
|
232 |
|
|
233 |
while (level < maxlevel) {
|
|
234 |
if (P2PHASE(blkid, 1ULL << wshift))
|
|
235 |
break;
|
|
236 |
blkid >>= wshift;
|
|
237 |
level++;
|
|
238 |
}
|
|
239 |
}
|
|
240 |
} else {
|
|
241 |
if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
|
|
242 |
blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
|
|
243 |
level = 0;
|
|
244 |
} else {
|
|
245 |
blkid >>= wshift;
|
|
246 |
level++;
|
|
247 |
}
|
|
248 |
|
|
249 |
while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
|
|
250 |
dnp->dn_maxblkid) {
|
|
251 |
if (level == maxlevel)
|
|
252 |
return (ERANGE);
|
|
253 |
blkid >>= wshift;
|
|
254 |
level++;
|
|
255 |
}
|
|
256 |
}
|
|
257 |
SET_BOOKMARK_LB(zb, level, blkid);
|
|
258 |
|
|
259 |
if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
|
|
260 |
return (ERANGE);
|
|
261 |
|
|
262 |
return (EAGAIN);
|
|
263 |
}
|
|
264 |
|
|
265 |
static int
|
|
266 |
traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
|
|
267 |
{
|
|
268 |
/*
|
|
269 |
* Before we issue the callback, prune against maxtxg.
|
|
270 |
*
|
|
271 |
* We prune against mintxg before we get here because it's a big win.
|
|
272 |
* If a given block was born in txg 37, then we know that the entire
|
|
273 |
* subtree below that block must have been born in txg 37 or earlier.
|
|
274 |
* We can therefore lop off huge branches of the tree as we go.
|
|
275 |
*
|
|
276 |
* There's no corresponding optimization for maxtxg because knowing
|
|
277 |
* that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
|
|
278 |
* children. In fact, the copy-on-write design of ZFS ensures that
|
|
279 |
* top-level blocks will pretty much always be new.
|
|
280 |
*
|
|
281 |
* Therefore, in the name of simplicity we don't prune against
|
|
282 |
* maxtxg until the last possible moment -- that being right now.
|
|
283 |
*/
|
|
284 |
if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
|
|
285 |
return (0);
|
|
286 |
|
|
287 |
if (bc->bc_errno == 0) {
|
|
288 |
zbookmark_t *zb = &bc->bc_bookmark;
|
|
289 |
zbookmark_t *szb = &zseg->seg_start;
|
|
290 |
zbookmark_t *ezb = &zseg->seg_end;
|
|
291 |
zbookmark_t *lzb = &th->th_lastcb;
|
|
292 |
dnode_phys_t *dnp = bc->bc_dnode;
|
|
293 |
|
|
294 |
/*
|
|
295 |
* Debugging: verify that the order we visit things
|
|
296 |
* agrees with the order defined by compare_bookmark().
|
|
297 |
*/
|
|
298 |
ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
|
|
299 |
ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
|
|
300 |
ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
|
|
301 |
lzb->zb_level == ZB_NO_LEVEL);
|
|
302 |
*lzb = *zb;
|
|
303 |
}
|
|
304 |
|
|
305 |
th->th_callbacks++;
|
|
306 |
return (th->th_func(bc, th->th_spa, th->th_arg));
|
|
307 |
}
|
|
308 |
|
|
309 |
static int
|
|
310 |
traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
|
|
311 |
dnode_phys_t *dnp)
|
|
312 |
{
|
|
313 |
zbookmark_t *zb = &bc->bc_bookmark;
|
|
314 |
int error;
|
|
315 |
|
|
316 |
th->th_hits++;
|
|
317 |
|
|
318 |
bc->bc_dnode = dnp;
|
|
319 |
bc->bc_errno = 0;
|
|
320 |
|
|
321 |
if (BP_EQUAL(&bc->bc_blkptr, bp))
|
|
322 |
return (0);
|
|
323 |
|
|
324 |
bc->bc_blkptr = *bp;
|
|
325 |
|
|
326 |
if (bc->bc_data == NULL)
|
|
327 |
return (0);
|
|
328 |
|
|
329 |
if (BP_IS_HOLE(bp)) {
|
|
330 |
ASSERT(th->th_advance & ADVANCE_HOLES);
|
|
331 |
return (0);
|
|
332 |
}
|
|
333 |
|
|
334 |
if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
|
|
335 |
error = EIO;
|
|
336 |
} else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
|
|
337 |
error = 0;
|
|
338 |
th->th_arc_hits++;
|
|
339 |
} else {
|
|
340 |
error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
|
|
341 |
BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
|
|
342 |
th->th_zio_flags | ZIO_FLAG_DONT_CACHE));
|
|
343 |
|
|
344 |
if (BP_SHOULD_BYTESWAP(bp) && error == 0)
|
|
345 |
(zb->zb_level > 0 ? byteswap_uint64_array :
|
|
346 |
dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
|
|
347 |
BP_GET_LSIZE(bp));
|
|
348 |
th->th_reads++;
|
|
349 |
}
|
|
350 |
|
|
351 |
if (error) {
|
|
352 |
bc->bc_errno = error;
|
|
353 |
error = traverse_callback(th, NULL, bc);
|
|
354 |
ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
|
|
355 |
bc->bc_blkptr.blk_birth = -1ULL;
|
|
356 |
}
|
|
357 |
|
|
358 |
dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
|
|
359 |
bc - &th->th_cache[0][0], error,
|
|
360 |
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
|
|
361 |
|
|
362 |
return (error);
|
|
363 |
}
|
|
364 |
|
|
365 |
static int
|
|
366 |
find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
|
|
367 |
{
|
|
368 |
zbookmark_t *zb = &zseg->seg_start;
|
|
369 |
traverse_blk_cache_t *bc;
|
|
370 |
blkptr_t *bp = dnp->dn_blkptr;
|
|
371 |
int i, first, level;
|
|
372 |
int nbp = dnp->dn_nblkptr;
|
|
373 |
int minlevel = zb->zb_level;
|
|
374 |
int maxlevel = dnp->dn_nlevels - 1;
|
|
375 |
int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
|
|
376 |
int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
|
|
377 |
uint64_t blkid = zb->zb_blkid >> bp_shift;
|
|
378 |
int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
|
|
379 |
int rc;
|
|
380 |
|
|
381 |
if (minlevel > maxlevel || blkid >= nbp)
|
|
382 |
return (ERANGE);
|
|
383 |
|
|
384 |
for (level = maxlevel; level >= minlevel; level--) {
|
|
385 |
first = P2PHASE(blkid, 1ULL << wshift);
|
|
386 |
|
|
387 |
for (i = first; i < nbp; i++)
|
|
388 |
if (bp[i].blk_birth > zseg->seg_mintxg ||
|
|
389 |
BP_IS_HOLE(&bp[i]) && do_holes)
|
|
390 |
break;
|
|
391 |
|
|
392 |
if (i != first) {
|
|
393 |
i--;
|
|
394 |
SET_BOOKMARK_LB(zb, level, blkid + (i - first));
|
|
395 |
return (ENOTBLK);
|
|
396 |
}
|
|
397 |
|
|
398 |
bc = &th->th_cache[depth][level];
|
|
399 |
|
|
400 |
SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
|
|
401 |
level, blkid);
|
|
402 |
|
|
403 |
if (rc = traverse_read(th, bc, bp + i, dnp)) {
|
|
404 |
if (rc != EAGAIN) {
|
|
405 |
SET_BOOKMARK_LB(zb, level, blkid);
|
|
406 |
}
|
|
407 |
return (rc);
|
|
408 |
}
|
|
409 |
|
|
410 |
if (BP_IS_HOLE(&bp[i])) {
|
|
411 |
SET_BOOKMARK_LB(zb, level, blkid);
|
|
412 |
th->th_lastcb.zb_level = ZB_NO_LEVEL;
|
|
413 |
return (0);
|
|
414 |
}
|
|
415 |
|
|
416 |
nbp = 1 << wshift;
|
|
417 |
bp = bc->bc_data;
|
|
418 |
bp_shift -= wshift;
|
|
419 |
blkid = zb->zb_blkid >> bp_shift;
|
|
420 |
}
|
|
421 |
|
|
422 |
return (0);
|
|
423 |
}
|
|
424 |
|
|
425 |
static int
|
|
426 |
get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
|
|
427 |
uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
|
|
428 |
{
|
|
429 |
zseg_t zseg;
|
|
430 |
zbookmark_t *zb = &zseg.seg_start;
|
|
431 |
uint64_t object = *objectp;
|
|
432 |
int i, rc;
|
|
433 |
|
|
434 |
SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
|
|
435 |
SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
|
|
436 |
|
|
437 |
zseg.seg_mintxg = txg;
|
|
438 |
zseg.seg_maxtxg = -1ULL;
|
|
439 |
|
|
440 |
for (;;) {
|
|
441 |
rc = find_block(th, &zseg, mdn, depth);
|
|
442 |
|
|
443 |
if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
|
|
444 |
break;
|
|
445 |
|
|
446 |
if (rc == 0 && zb->zb_level == 0) {
|
|
447 |
dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
|
|
448 |
for (i = 0; i < DNODES_PER_BLOCK; i++) {
|
|
449 |
object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
|
|
450 |
if (object >= *objectp &&
|
|
451 |
dnp[i].dn_type != DMU_OT_NONE &&
|
|
452 |
(type == -1 || dnp[i].dn_type == type)) {
|
|
453 |
*objectp = object;
|
|
454 |
*dnpp = &dnp[i];
|
|
455 |
return (0);
|
|
456 |
}
|
|
457 |
}
|
|
458 |
}
|
|
459 |
|
|
460 |
rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
|
|
461 |
|
|
462 |
if (rc == ERANGE)
|
|
463 |
break;
|
|
464 |
}
|
|
465 |
|
|
466 |
if (rc == ERANGE)
|
|
467 |
*objectp = ZB_MAXOBJECT;
|
|
468 |
|
|
469 |
return (rc);
|
|
470 |
}
|
|
471 |
|
|
472 |
static int
|
|
473 |
traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
|
|
474 |
{
|
|
475 |
zbookmark_t *zb = &zseg->seg_start;
|
|
476 |
traverse_blk_cache_t *bc;
|
|
477 |
dnode_phys_t *dn, *dn_tmp;
|
|
478 |
int worklimit = 1000;
|
|
479 |
int rc;
|
|
480 |
|
|
481 |
dprintf("<%llu, %llu, %d, %llx>\n",
|
|
482 |
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
|
|
483 |
|
|
484 |
bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
|
|
485 |
dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
|
|
486 |
|
|
487 |
SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
|
|
488 |
|
|
489 |
rc = traverse_read(th, bc, mosbp, dn);
|
|
490 |
|
|
491 |
if (rc) /* If we get ERESTART, we've got nowhere left to go */
|
|
492 |
return (rc == ERESTART ? EINTR : rc);
|
|
493 |
|
|
494 |
ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
|
|
495 |
|
|
496 |
if (zb->zb_objset != 0) {
|
|
497 |
uint64_t objset = zb->zb_objset;
|
|
498 |
dsl_dataset_phys_t *dsp;
|
|
499 |
|
|
500 |
rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
|
|
501 |
DMU_OT_DSL_OBJSET, ZB_MOS_CACHE);
|
|
502 |
|
|
503 |
if (objset != zb->zb_objset)
|
|
504 |
rc = advance_objset(zseg, objset, th->th_advance);
|
|
505 |
|
|
506 |
if (rc != 0)
|
|
507 |
return (rc);
|
|
508 |
|
|
509 |
dsp = DN_BONUS(dn_tmp);
|
|
510 |
|
|
511 |
bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
|
|
512 |
dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
|
|
513 |
|
|
514 |
SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
|
|
515 |
|
|
516 |
rc = traverse_read(th, bc, &dsp->ds_bp, dn);
|
|
517 |
|
|
518 |
if (rc != 0) {
|
|
519 |
if (rc == ERESTART)
|
|
520 |
rc = advance_objset(zseg, zb->zb_objset + 1,
|
|
521 |
th->th_advance);
|
|
522 |
return (rc);
|
|
523 |
}
|
|
524 |
|
|
525 |
if (th->th_advance & ADVANCE_PRUNE)
|
|
526 |
zseg->seg_mintxg =
|
|
527 |
MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
|
|
528 |
}
|
|
529 |
|
|
530 |
if (zb->zb_level == -1) {
|
|
531 |
ASSERT(zb->zb_object == 0);
|
|
532 |
|
|
533 |
if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
|
|
534 |
rc = traverse_callback(th, zseg, bc);
|
|
535 |
if (rc) {
|
|
536 |
ASSERT(rc == EINTR);
|
|
537 |
return (rc);
|
|
538 |
}
|
|
539 |
}
|
|
540 |
|
|
541 |
return (advance_from_osphys(zseg, th->th_advance));
|
|
542 |
}
|
|
543 |
|
|
544 |
if (zb->zb_object != 0) {
|
|
545 |
uint64_t object = zb->zb_object;
|
|
546 |
|
|
547 |
rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
|
|
548 |
zseg->seg_mintxg, -1, ZB_MDN_CACHE);
|
|
549 |
|
|
550 |
if (object != zb->zb_object)
|
|
551 |
rc = advance_object(zseg, object, th->th_advance);
|
|
552 |
|
|
553 |
if (rc != 0)
|
|
554 |
return (rc);
|
|
555 |
|
|
556 |
dn = dn_tmp;
|
|
557 |
}
|
|
558 |
|
|
559 |
if (zb->zb_level == ZB_MAXLEVEL)
|
|
560 |
zb->zb_level = dn->dn_nlevels - 1;
|
|
561 |
|
|
562 |
for (;;) {
|
|
563 |
rc = find_block(th, zseg, dn, ZB_DN_CACHE);
|
|
564 |
|
|
565 |
if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
|
|
566 |
break;
|
|
567 |
|
|
568 |
if (rc == 0) {
|
|
569 |
bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
|
|
570 |
ASSERT(bc->bc_dnode == dn);
|
|
571 |
ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
|
|
572 |
rc = traverse_callback(th, zseg, bc);
|
|
573 |
if (rc) {
|
|
574 |
ASSERT(rc == EINTR);
|
|
575 |
return (rc);
|
|
576 |
}
|
|
577 |
if (BP_IS_HOLE(&bc->bc_blkptr)) {
|
|
578 |
ASSERT(th->th_advance & ADVANCE_HOLES);
|
|
579 |
rc = ENOTBLK;
|
|
580 |
}
|
|
581 |
}
|
|
582 |
|
|
583 |
rc = advance_block(zseg, dn, rc, th->th_advance);
|
|
584 |
|
|
585 |
if (rc == ERANGE)
|
|
586 |
break;
|
|
587 |
|
|
588 |
/*
|
|
589 |
* Give spa_sync() a chance to run.
|
|
590 |
*/
|
|
591 |
if (spa_traverse_wanted(th->th_spa)) {
|
|
592 |
th->th_syncs++;
|
|
593 |
return (EAGAIN);
|
|
594 |
}
|
|
595 |
|
|
596 |
if (--worklimit == 0)
|
|
597 |
return (EAGAIN);
|
|
598 |
}
|
|
599 |
|
|
600 |
if (rc == ERANGE)
|
|
601 |
rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
|
|
602 |
|
|
603 |
return (rc);
|
|
604 |
}
|
|
605 |
|
|
606 |
/*
|
|
607 |
* It is the caller's responsibility to ensure that the dsl_dataset_t
|
|
608 |
* doesn't go away during traversal.
|
|
609 |
*/
|
|
610 |
int
|
|
611 |
traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
|
|
612 |
blkptr_cb_t func, void *arg)
|
|
613 |
{
|
|
614 |
spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
|
|
615 |
traverse_handle_t *th;
|
|
616 |
int err;
|
|
617 |
|
|
618 |
th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
|
|
619 |
|
|
620 |
traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
|
|
621 |
|
|
622 |
while ((err = traverse_more(th)) == EAGAIN)
|
|
623 |
continue;
|
|
624 |
|
|
625 |
traverse_fini(th);
|
|
626 |
return (err);
|
|
627 |
}
|
|
628 |
|
|
629 |
int
|
|
630 |
traverse_more(traverse_handle_t *th)
|
|
631 |
{
|
|
632 |
zseg_t *zseg = list_head(&th->th_seglist);
|
|
633 |
uint64_t save_txg; /* XXX won't be necessary with real itinerary */
|
|
634 |
krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
|
|
635 |
blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
|
|
636 |
int rc;
|
|
637 |
|
|
638 |
if (zseg == NULL)
|
|
639 |
return (0);
|
|
640 |
|
|
641 |
th->th_restarts++;
|
|
642 |
|
|
643 |
save_txg = zseg->seg_mintxg;
|
|
644 |
|
|
645 |
if (!(th->th_advance & ADVANCE_NOLOCK))
|
|
646 |
rw_enter(rw, RW_READER);
|
|
647 |
|
|
648 |
rc = traverse_segment(th, zseg, mosbp);
|
|
649 |
ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
|
|
650 |
|
|
651 |
if (!(th->th_advance & ADVANCE_NOLOCK))
|
|
652 |
rw_exit(rw);
|
|
653 |
|
|
654 |
zseg->seg_mintxg = save_txg;
|
|
655 |
|
|
656 |
if (rc == ERANGE) {
|
|
657 |
list_remove(&th->th_seglist, zseg);
|
|
658 |
kmem_free(zseg, sizeof (*zseg));
|
|
659 |
return (EAGAIN);
|
|
660 |
}
|
|
661 |
|
|
662 |
return (rc);
|
|
663 |
}
|
|
664 |
|
|
665 |
/*
|
|
666 |
* Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
|
|
667 |
* are not included. The blocks covered by this segment will all have
|
|
668 |
* mintxg < birth < maxtxg.
|
|
669 |
*/
|
|
670 |
static void
|
|
671 |
traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
|
|
672 |
uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
|
|
673 |
uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
|
|
674 |
{
|
|
675 |
zseg_t *zseg;
|
|
676 |
|
|
677 |
zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
|
|
678 |
|
|
679 |
zseg->seg_mintxg = mintxg;
|
|
680 |
zseg->seg_maxtxg = maxtxg;
|
|
681 |
|
|
682 |
zseg->seg_start.zb_objset = sobjset;
|
|
683 |
zseg->seg_start.zb_object = sobject;
|
|
684 |
zseg->seg_start.zb_level = slevel;
|
|
685 |
zseg->seg_start.zb_blkid = sblkid;
|
|
686 |
|
|
687 |
zseg->seg_end.zb_objset = eobjset;
|
|
688 |
zseg->seg_end.zb_object = eobject;
|
|
689 |
zseg->seg_end.zb_level = elevel;
|
|
690 |
zseg->seg_end.zb_blkid = eblkid;
|
|
691 |
|
|
692 |
list_insert_tail(&th->th_seglist, zseg);
|
|
693 |
}
|
|
694 |
|
|
695 |
void
|
|
696 |
traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
|
|
697 |
uint64_t objset, uint64_t object)
|
|
698 |
{
|
|
699 |
if (th->th_advance & ADVANCE_PRE)
|
|
700 |
traverse_add_segment(th, mintxg, maxtxg,
|
|
701 |
objset, object, ZB_MAXLEVEL, 0,
|
|
702 |
objset, object, 0, ZB_MAXBLKID);
|
|
703 |
else
|
|
704 |
traverse_add_segment(th, mintxg, maxtxg,
|
|
705 |
objset, object, 0, 0,
|
|
706 |
objset, object, 0, ZB_MAXBLKID);
|
|
707 |
}
|
|
708 |
|
|
709 |
void
|
|
710 |
traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
|
|
711 |
uint64_t objset)
|
|
712 |
{
|
|
713 |
if (th->th_advance & ADVANCE_PRE)
|
|
714 |
traverse_add_segment(th, mintxg, maxtxg,
|
|
715 |
objset, 0, -1, 0,
|
|
716 |
objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
|
|
717 |
else
|
|
718 |
traverse_add_segment(th, mintxg, maxtxg,
|
|
719 |
objset, 1, 0, 0,
|
|
720 |
objset, 0, -1, 0);
|
|
721 |
}
|
|
722 |
|
|
723 |
void
|
|
724 |
traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
|
|
725 |
{
|
|
726 |
if (th->th_advance & ADVANCE_PRE)
|
|
727 |
traverse_add_segment(th, mintxg, maxtxg,
|
|
728 |
0, 0, -1, 0,
|
|
729 |
ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
|
|
730 |
else
|
|
731 |
traverse_add_segment(th, mintxg, maxtxg,
|
|
732 |
1, 1, 0, 0,
|
|
733 |
0, 0, -1, 0);
|
|
734 |
}
|
|
735 |
|
|
736 |
traverse_handle_t *
|
|
737 |
traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
|
|
738 |
int zio_flags)
|
|
739 |
{
|
|
740 |
traverse_handle_t *th;
|
|
741 |
int d, l;
|
|
742 |
|
|
743 |
th = kmem_zalloc(sizeof (*th), KM_SLEEP);
|
|
744 |
|
|
745 |
th->th_spa = spa;
|
|
746 |
th->th_func = func;
|
|
747 |
th->th_arg = arg;
|
|
748 |
th->th_advance = advance;
|
|
749 |
th->th_lastcb.zb_level = ZB_NO_LEVEL;
|
|
750 |
th->th_noread.zb_level = ZB_NO_LEVEL;
|
|
751 |
th->th_zio_flags = zio_flags;
|
|
752 |
|
|
753 |
list_create(&th->th_seglist, sizeof (zseg_t),
|
|
754 |
offsetof(zseg_t, seg_node));
|
|
755 |
|
|
756 |
for (d = 0; d < ZB_DEPTH; d++) {
|
|
757 |
for (l = 0; l < ZB_MAXLEVEL; l++) {
|
|
758 |
if ((advance & ADVANCE_DATA) ||
|
|
759 |
l != 0 || d != ZB_DN_CACHE)
|
|
760 |
th->th_cache[d][l].bc_data =
|
|
761 |
zio_buf_alloc(SPA_MAXBLOCKSIZE);
|
|
762 |
}
|
|
763 |
}
|
|
764 |
|
|
765 |
return (th);
|
|
766 |
}
|
|
767 |
|
|
768 |
void
|
|
769 |
traverse_fini(traverse_handle_t *th)
|
|
770 |
{
|
|
771 |
int d, l;
|
|
772 |
zseg_t *zseg;
|
|
773 |
|
|
774 |
for (d = 0; d < ZB_DEPTH; d++)
|
|
775 |
for (l = 0; l < ZB_MAXLEVEL; l++)
|
|
776 |
if (th->th_cache[d][l].bc_data != NULL)
|
|
777 |
zio_buf_free(th->th_cache[d][l].bc_data,
|
|
778 |
SPA_MAXBLOCKSIZE);
|
|
779 |
|
|
780 |
while ((zseg = list_head(&th->th_seglist)) != NULL) {
|
|
781 |
list_remove(&th->th_seglist, zseg);
|
|
782 |
kmem_free(zseg, sizeof (*zseg));
|
|
783 |
}
|
|
784 |
|
|
785 |
list_destroy(&th->th_seglist);
|
|
786 |
|
|
787 |
dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
|
|
788 |
th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
|
|
789 |
th->th_syncs, th->th_restarts);
|
|
790 |
|
|
791 |
kmem_free(th, sizeof (*th));
|
|
792 |
}
|