|
1 /* |
|
2 * CDDL HEADER START |
|
3 * |
|
4 * The contents of this file are subject to the terms of the |
|
5 * Common Development and Distribution License (the "License"). |
|
6 * You may not use this file except in compliance with the License. |
|
7 * |
|
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
|
9 * or http://www.opensolaris.org/os/licensing. |
|
10 * See the License for the specific language governing permissions |
|
11 * and limitations under the License. |
|
12 * |
|
13 * When distributing Covered Code, include this CDDL HEADER in each |
|
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
|
15 * If applicable, add the following below this CDDL HEADER, with the |
|
16 * fields enclosed by brackets "[]" replaced with your own identifying |
|
17 * information: Portions Copyright [yyyy] [name of copyright owner] |
|
18 * |
|
19 * CDDL HEADER END |
|
20 */ |
|
21 /* |
|
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
|
23 */ |
|
24 |
|
25 #include <sys/bpobj.h> |
|
26 #include <sys/zfs_context.h> |
|
27 #include <sys/refcount.h> |
|
28 |
|
29 uint64_t |
|
30 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) |
|
31 { |
|
32 int size; |
|
33 |
|
34 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) |
|
35 size = BPOBJ_SIZE_V0; |
|
36 else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) |
|
37 size = BPOBJ_SIZE_V1; |
|
38 else |
|
39 size = sizeof (bpobj_phys_t); |
|
40 |
|
41 return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, |
|
42 DMU_OT_BPOBJ_HDR, size, tx)); |
|
43 } |
|
44 |
|
45 void |
|
46 bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) |
|
47 { |
|
48 int64_t i; |
|
49 bpobj_t bpo; |
|
50 dmu_object_info_t doi; |
|
51 int epb; |
|
52 dmu_buf_t *dbuf = NULL; |
|
53 |
|
54 VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); |
|
55 |
|
56 mutex_enter(&bpo.bpo_lock); |
|
57 |
|
58 if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) |
|
59 goto out; |
|
60 |
|
61 VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); |
|
62 epb = doi.doi_data_block_size / sizeof (uint64_t); |
|
63 |
|
64 for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { |
|
65 uint64_t *objarray; |
|
66 uint64_t offset, blkoff; |
|
67 |
|
68 offset = i * sizeof (uint64_t); |
|
69 blkoff = P2PHASE(i, epb); |
|
70 |
|
71 if (dbuf == NULL || dbuf->db_offset > offset) { |
|
72 if (dbuf) |
|
73 dmu_buf_rele(dbuf, FTAG); |
|
74 VERIFY3U(0, ==, dmu_buf_hold(os, |
|
75 bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); |
|
76 } |
|
77 |
|
78 ASSERT3U(offset, >=, dbuf->db_offset); |
|
79 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); |
|
80 |
|
81 objarray = dbuf->db_data; |
|
82 bpobj_free(os, objarray[blkoff], tx); |
|
83 } |
|
84 if (dbuf) { |
|
85 dmu_buf_rele(dbuf, FTAG); |
|
86 dbuf = NULL; |
|
87 } |
|
88 VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); |
|
89 |
|
90 out: |
|
91 mutex_exit(&bpo.bpo_lock); |
|
92 bpobj_close(&bpo); |
|
93 |
|
94 VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); |
|
95 } |
|
96 |
|
97 int |
|
98 bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) |
|
99 { |
|
100 dmu_object_info_t doi; |
|
101 int err; |
|
102 |
|
103 err = dmu_object_info(os, object, &doi); |
|
104 if (err) |
|
105 return (err); |
|
106 |
|
107 bzero(bpo, sizeof (*bpo)); |
|
108 mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); |
|
109 |
|
110 ASSERT(bpo->bpo_dbuf == NULL); |
|
111 ASSERT(bpo->bpo_phys == NULL); |
|
112 ASSERT(object != 0); |
|
113 ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); |
|
114 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); |
|
115 |
|
116 bpo->bpo_os = os; |
|
117 bpo->bpo_object = object; |
|
118 bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; |
|
119 bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); |
|
120 bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); |
|
121 |
|
122 err = dmu_bonus_hold(bpo->bpo_os, |
|
123 bpo->bpo_object, bpo, &bpo->bpo_dbuf); |
|
124 if (err) |
|
125 return (err); |
|
126 bpo->bpo_phys = bpo->bpo_dbuf->db_data; |
|
127 return (0); |
|
128 } |
|
129 |
|
130 void |
|
131 bpobj_close(bpobj_t *bpo) |
|
132 { |
|
133 /* Lame workaround for closing a bpobj that was never opened. */ |
|
134 if (bpo->bpo_object == 0) |
|
135 return; |
|
136 |
|
137 dmu_buf_rele(bpo->bpo_dbuf, bpo); |
|
138 if (bpo->bpo_cached_dbuf != NULL) |
|
139 dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); |
|
140 bpo->bpo_dbuf = NULL; |
|
141 bpo->bpo_phys = NULL; |
|
142 bpo->bpo_cached_dbuf = NULL; |
|
143 |
|
144 mutex_destroy(&bpo->bpo_lock); |
|
145 } |
|
146 |
|
147 static int |
|
148 bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, |
|
149 boolean_t free) |
|
150 { |
|
151 dmu_object_info_t doi; |
|
152 int epb; |
|
153 int64_t i; |
|
154 int err = 0; |
|
155 dmu_buf_t *dbuf = NULL; |
|
156 |
|
157 mutex_enter(&bpo->bpo_lock); |
|
158 |
|
159 if (free) |
|
160 dmu_buf_will_dirty(bpo->bpo_dbuf, tx); |
|
161 |
|
162 for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { |
|
163 blkptr_t *bparray; |
|
164 blkptr_t *bp; |
|
165 uint64_t offset, blkoff; |
|
166 |
|
167 offset = i * sizeof (blkptr_t); |
|
168 blkoff = P2PHASE(i, bpo->bpo_epb); |
|
169 |
|
170 if (dbuf == NULL || dbuf->db_offset > offset) { |
|
171 if (dbuf) |
|
172 dmu_buf_rele(dbuf, FTAG); |
|
173 err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, |
|
174 FTAG, &dbuf, 0); |
|
175 if (err) |
|
176 break; |
|
177 } |
|
178 |
|
179 ASSERT3U(offset, >=, dbuf->db_offset); |
|
180 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); |
|
181 |
|
182 bparray = dbuf->db_data; |
|
183 bp = &bparray[blkoff]; |
|
184 err = func(arg, bp, tx); |
|
185 if (err) |
|
186 break; |
|
187 if (free) { |
|
188 bpo->bpo_phys->bpo_bytes -= |
|
189 bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); |
|
190 ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); |
|
191 if (bpo->bpo_havecomp) { |
|
192 bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); |
|
193 bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); |
|
194 } |
|
195 bpo->bpo_phys->bpo_num_blkptrs--; |
|
196 ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); |
|
197 } |
|
198 } |
|
199 if (dbuf) { |
|
200 dmu_buf_rele(dbuf, FTAG); |
|
201 dbuf = NULL; |
|
202 } |
|
203 if (free) { |
|
204 i++; |
|
205 VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, |
|
206 i * sizeof (blkptr_t), -1ULL, tx)); |
|
207 } |
|
208 if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) |
|
209 goto out; |
|
210 |
|
211 ASSERT(bpo->bpo_havecomp); |
|
212 err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); |
|
213 if (err) |
|
214 return (err); |
|
215 epb = doi.doi_data_block_size / sizeof (uint64_t); |
|
216 |
|
217 for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { |
|
218 uint64_t *objarray; |
|
219 uint64_t offset, blkoff; |
|
220 bpobj_t sublist; |
|
221 uint64_t used_before, comp_before, uncomp_before; |
|
222 uint64_t used_after, comp_after, uncomp_after; |
|
223 |
|
224 offset = i * sizeof (uint64_t); |
|
225 blkoff = P2PHASE(i, epb); |
|
226 |
|
227 if (dbuf == NULL || dbuf->db_offset > offset) { |
|
228 if (dbuf) |
|
229 dmu_buf_rele(dbuf, FTAG); |
|
230 err = dmu_buf_hold(bpo->bpo_os, |
|
231 bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); |
|
232 if (err) |
|
233 break; |
|
234 } |
|
235 |
|
236 ASSERT3U(offset, >=, dbuf->db_offset); |
|
237 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); |
|
238 |
|
239 objarray = dbuf->db_data; |
|
240 err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); |
|
241 if (err) |
|
242 break; |
|
243 if (free) { |
|
244 err = bpobj_space(&sublist, |
|
245 &used_before, &comp_before, &uncomp_before); |
|
246 if (err) |
|
247 break; |
|
248 } |
|
249 err = bpobj_iterate_impl(&sublist, func, arg, tx, free); |
|
250 if (free) { |
|
251 VERIFY3U(0, ==, bpobj_space(&sublist, |
|
252 &used_after, &comp_after, &uncomp_after)); |
|
253 bpo->bpo_phys->bpo_bytes -= used_before - used_after; |
|
254 ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); |
|
255 bpo->bpo_phys->bpo_comp -= comp_before - used_after; |
|
256 bpo->bpo_phys->bpo_uncomp -= |
|
257 uncomp_before - uncomp_after; |
|
258 } |
|
259 |
|
260 bpobj_close(&sublist); |
|
261 if (err) |
|
262 break; |
|
263 if (free) { |
|
264 err = dmu_object_free(bpo->bpo_os, |
|
265 objarray[blkoff], tx); |
|
266 if (err) |
|
267 break; |
|
268 bpo->bpo_phys->bpo_num_subobjs--; |
|
269 ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); |
|
270 } |
|
271 } |
|
272 if (dbuf) { |
|
273 dmu_buf_rele(dbuf, FTAG); |
|
274 dbuf = NULL; |
|
275 } |
|
276 if (free) { |
|
277 VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, |
|
278 bpo->bpo_phys->bpo_subobjs, |
|
279 (i + 1) * sizeof (uint64_t), -1ULL, tx)); |
|
280 } |
|
281 |
|
282 out: |
|
283 /* If there are no entries, there should be no bytes. */ |
|
284 ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 || |
|
285 (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) || |
|
286 bpo->bpo_phys->bpo_bytes == 0); |
|
287 |
|
288 mutex_exit(&bpo->bpo_lock); |
|
289 return (err); |
|
290 } |
|
291 |
|
292 /* |
|
293 * Iterate and remove the entries. If func returns nonzero, iteration |
|
294 * will stop and that entry will not be removed. |
|
295 */ |
|
296 int |
|
297 bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) |
|
298 { |
|
299 return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); |
|
300 } |
|
301 |
|
302 /* |
|
303 * Iterate the entries. If func returns nonzero, iteration will stop. |
|
304 */ |
|
305 int |
|
306 bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) |
|
307 { |
|
308 return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); |
|
309 } |
|
310 |
|
311 void |
|
312 bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) |
|
313 { |
|
314 bpobj_t subbpo; |
|
315 uint64_t used, comp, uncomp; |
|
316 |
|
317 ASSERT(bpo->bpo_havesubobj); |
|
318 ASSERT(bpo->bpo_havecomp); |
|
319 |
|
320 VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); |
|
321 VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); |
|
322 bpobj_close(&subbpo); |
|
323 |
|
324 if (used == 0) { |
|
325 /* No point in having an empty subobj. */ |
|
326 bpobj_free(bpo->bpo_os, subobj, tx); |
|
327 return; |
|
328 } |
|
329 |
|
330 dmu_buf_will_dirty(bpo->bpo_dbuf, tx); |
|
331 if (bpo->bpo_phys->bpo_subobjs == 0) { |
|
332 bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, |
|
333 DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); |
|
334 } |
|
335 |
|
336 mutex_enter(&bpo->bpo_lock); |
|
337 dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, |
|
338 bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), |
|
339 sizeof (subobj), &subobj, tx); |
|
340 bpo->bpo_phys->bpo_num_subobjs++; |
|
341 bpo->bpo_phys->bpo_bytes += used; |
|
342 bpo->bpo_phys->bpo_comp += comp; |
|
343 bpo->bpo_phys->bpo_uncomp += uncomp; |
|
344 mutex_exit(&bpo->bpo_lock); |
|
345 } |
|
346 |
|
347 void |
|
348 bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) |
|
349 { |
|
350 blkptr_t stored_bp = *bp; |
|
351 uint64_t offset; |
|
352 int blkoff; |
|
353 blkptr_t *bparray; |
|
354 |
|
355 ASSERT(!BP_IS_HOLE(bp)); |
|
356 |
|
357 /* We never need the fill count. */ |
|
358 stored_bp.blk_fill = 0; |
|
359 |
|
360 /* The bpobj will compress better if we can leave off the checksum */ |
|
361 if (!BP_GET_DEDUP(bp)) |
|
362 bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); |
|
363 |
|
364 mutex_enter(&bpo->bpo_lock); |
|
365 |
|
366 offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); |
|
367 blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); |
|
368 |
|
369 if (bpo->bpo_cached_dbuf == NULL || |
|
370 offset < bpo->bpo_cached_dbuf->db_offset || |
|
371 offset >= bpo->bpo_cached_dbuf->db_offset + |
|
372 bpo->bpo_cached_dbuf->db_size) { |
|
373 if (bpo->bpo_cached_dbuf) |
|
374 dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); |
|
375 VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, |
|
376 offset, bpo, &bpo->bpo_cached_dbuf, 0)); |
|
377 } |
|
378 |
|
379 dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); |
|
380 bparray = bpo->bpo_cached_dbuf->db_data; |
|
381 bparray[blkoff] = stored_bp; |
|
382 |
|
383 dmu_buf_will_dirty(bpo->bpo_dbuf, tx); |
|
384 bpo->bpo_phys->bpo_num_blkptrs++; |
|
385 bpo->bpo_phys->bpo_bytes += |
|
386 bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); |
|
387 if (bpo->bpo_havecomp) { |
|
388 bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); |
|
389 bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); |
|
390 } |
|
391 mutex_exit(&bpo->bpo_lock); |
|
392 } |
|
393 |
|
394 struct space_range_arg { |
|
395 spa_t *spa; |
|
396 uint64_t mintxg; |
|
397 uint64_t maxtxg; |
|
398 uint64_t used; |
|
399 uint64_t comp; |
|
400 uint64_t uncomp; |
|
401 }; |
|
402 |
|
403 /* ARGSUSED */ |
|
404 static int |
|
405 space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) |
|
406 { |
|
407 struct space_range_arg *sra = arg; |
|
408 |
|
409 if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { |
|
410 sra->used += bp_get_dsize_sync(sra->spa, bp); |
|
411 sra->comp += BP_GET_PSIZE(bp); |
|
412 sra->uncomp += BP_GET_UCSIZE(bp); |
|
413 } |
|
414 return (0); |
|
415 } |
|
416 |
|
417 int |
|
418 bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) |
|
419 { |
|
420 mutex_enter(&bpo->bpo_lock); |
|
421 |
|
422 *usedp = bpo->bpo_phys->bpo_bytes; |
|
423 if (bpo->bpo_havecomp) { |
|
424 *compp = bpo->bpo_phys->bpo_comp; |
|
425 *uncompp = bpo->bpo_phys->bpo_uncomp; |
|
426 mutex_exit(&bpo->bpo_lock); |
|
427 return (0); |
|
428 } else { |
|
429 mutex_exit(&bpo->bpo_lock); |
|
430 return (bpobj_space_range(bpo, 0, UINT64_MAX, |
|
431 usedp, compp, uncompp)); |
|
432 } |
|
433 } |
|
434 |
|
435 /* |
|
436 * Return the amount of space in the bpobj which is: |
|
437 * mintxg < blk_birth <= maxtxg |
|
438 */ |
|
439 int |
|
440 bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, |
|
441 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) |
|
442 { |
|
443 struct space_range_arg sra = { 0 }; |
|
444 int err; |
|
445 |
|
446 /* |
|
447 * As an optimization, if they want the whole txg range, just |
|
448 * get bpo_bytes rather than iterating over the bps. |
|
449 */ |
|
450 if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) |
|
451 return (bpobj_space(bpo, usedp, compp, uncompp)); |
|
452 |
|
453 sra.spa = dmu_objset_spa(bpo->bpo_os); |
|
454 sra.mintxg = mintxg; |
|
455 sra.maxtxg = maxtxg; |
|
456 |
|
457 err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); |
|
458 *usedp = sra.used; |
|
459 *compp = sra.comp; |
|
460 *uncompp = sra.uncomp; |
|
461 return (err); |
|
462 } |