/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains the code to implement file range locking in
 * ZFS, although there isn't much specific to ZFS (all that comes to mind is
 * support for growing the blocksize).
 *
 * Interface
 * ---------
 * Defined in zfs_rlock.h but essentially:
 *	rl = zfs_range_lock(zp, off, len, lock_type);
 *	zfs_range_unlock(rl);
 *	zfs_range_reduce(rl, off, len);
 *
 * AVL tree
 * --------
 * An AVL tree is used to maintain the state of the existing ranges
 * that are locked for exclusive (writer) or shared (reader) use.
 * The starting range offset is used for searching and sorting the tree.
 *
 * Common case
 * -----------
 * The (hopefully) usual case is of no overlaps or contention for
 * locks. On entry to zfs_range_lock() a rl_t is allocated; the tree is
 * searched, no overlap is found, and *this* rl_t is placed in the tree.
 *
 * Overlaps/Reference counting/Proxy locks
 * ---------------------------------------
 * The avl code only allows one node at a particular offset. Also it's very
 * inefficient to search through all previous entries looking for overlaps
 * (because the very 1st in the ordered list might be at offset 0 but
 * cover the whole file).
 * So this implementation uses reference counts and proxy range locks.
 * Firstly, only reader locks use reference counts and proxy locks,
 * because writer locks are exclusive.
 * When a reader lock overlaps with another then a proxy lock is created
 * for that range and replaces the original lock. If the overlap
 * is exact then the reference count of the proxy is simply incremented.
 * Otherwise, the proxy lock is split into smaller lock ranges and
 * new proxy locks created for non-overlapping ranges.
 * The reference counts are adjusted accordingly.
 * Meanwhile, the original lock is kept around (this is the caller's handle)
 * and its offset and length are used when releasing the lock.
 *
 * Thread coordination
 * -------------------
 * In order to make wakeups efficient and to ensure multiple continuous
 * readers on a range don't starve a writer for the same range lock,
 * two condition variables are allocated in each rl_t.
 * If a writer (or reader) can't get a range it initialises the writer
 * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
 * and waits on that cv. When a thread unlocks that range it wakes up all
 * writers then all readers before destroying the lock.
 *
 * Append mode writes
 * ------------------
 * Append mode writes need to lock a range at the end of a file.
 * The offset of the end of the file is determined under the
 * range locking mutex, the lock type is converted from RL_APPEND to
 * RL_WRITER, and the range is locked.
 *
 * Grow block handling
 * -------------------
 * ZFS supports multiple block sizes, currently up to 128K. The smallest
 * block size is used for the file, which is grown as needed. During this
 * growth all other writers and readers must be excluded.
 * So if the block size needs to be grown then the whole file is
 * exclusively locked, then later the caller will reduce the lock
 * range to just the range to be written using zfs_range_reduce().
 */

#include <sys/zfs_rlock.h>
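
/*
 * Illustrative sketch, not part of the original file: the basic caller
 * pattern for the interface documented above.  zfs_range_lock() blocks
 * until the requested range is available, so the lock call itself cannot
 * fail.  The guard macro and function name below are hypothetical.
 */
#ifdef	ZFS_RLOCK_EXAMPLE
static void
example_read_range(znode_t *zp, uint64_t off, uint64_t len)
{
	/* take a shared (reader) lock over [off, off + len) */
	rl_t *rl = zfs_range_lock(zp, off, len, RL_READER);

	/* ... read the data under the shared lock ... */

	zfs_range_unlock(rl);	/* wakes any waiting writers, then readers */
}
#endif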

/*
 * Check if a write lock can be grabbed, or wait and recheck until available.
 */
static void
zfs_range_lock_writer(znode_t *zp, rl_t *new)
{
	avl_tree_t *tree = &zp->z_range_avl;
	rl_t *rl;
	avl_index_t where;
	uint64_t end_size;
	uint64_t off = new->r_off;
	uint64_t len = new->r_len;

	for (;;) {
		/*
		 * Range locking is also used by zvol and uses a
		 * dummied up znode. However, for zvol, we don't need to
		 * append or grow blocksize, and besides we don't have
		 * a z_phys or z_zfsvfs - so skip that processing.
		 *
		 * Yes, this is ugly, and would be solved by not handling
		 * grow or append in range lock code. If that was done then
		 * we could make the range locking code generically available
		 * to other non-zfs consumers.
		 */
		if (zp->z_vnode) { /* caller is ZPL */
			/*
			 * If in append mode pick up the current end of file.
			 * This is done under z_range_lock to avoid races.
			 */
			if (new->r_type == RL_APPEND)
				new->r_off = zp->z_phys->zp_size;

			/*
			 * If we need to grow the block size then grab the
			 * whole file range. This is also done under
			 * z_range_lock to avoid races.
			 */
			end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
			if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
			    zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
				new->r_off = 0;
				new->r_len = UINT64_MAX;
			}
		}

		/*
		 * First check for the usual case of no locks
		 */
		if (avl_numnodes(tree) == 0) {
			new->r_type = RL_WRITER; /* convert to writer */
			avl_add(tree, new);
			return;
		}

		/*
		 * Look for any locks in the range.
		 */
		rl = avl_find(tree, new, &where);
		if (rl)
			goto wait; /* already locked at same offset */

		rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
		if (rl && (rl->r_off < new->r_off + new->r_len))
			goto wait;

		rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
		if (rl && rl->r_off + rl->r_len > new->r_off)
			goto wait;

		new->r_type = RL_WRITER; /* convert possible RL_APPEND */
		avl_insert(tree, new, where);
		return;
wait:
		if (!rl->r_write_wanted) {
			cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
			rl->r_write_wanted = B_TRUE;
		}
		cv_wait(&rl->r_wr_cv, &zp->z_range_lock);

		/* reset to original */
		new->r_off = off;
		new->r_len = len;
	}
}

/*
 * If this is an original (non-proxy) lock then replace it by
 * a proxy and return the proxy.
 */
static rl_t *
zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
{
	rl_t *proxy;

	if (rl->r_proxy)
		return (rl); /* already a proxy */

	ASSERT3U(rl->r_cnt, ==, 1);
	ASSERT(rl->r_write_wanted == B_FALSE);
	ASSERT(rl->r_read_wanted == B_FALSE);
	avl_remove(tree, rl);
	rl->r_cnt = 0;

	/* create a proxy range lock */
	proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
	proxy->r_off = rl->r_off;
	proxy->r_len = rl->r_len;
	proxy->r_cnt = 1;
	proxy->r_type = RL_READER;
	proxy->r_proxy = B_TRUE;
	proxy->r_write_wanted = B_FALSE;
	proxy->r_read_wanted = B_FALSE;
	avl_add(tree, proxy);

	return (proxy);
}

/*
 * Split the range lock at the supplied offset
 * returning the *front* proxy.
 */
static rl_t *
zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
{
	rl_t *front, *rear;

	ASSERT3U(rl->r_len, >, 1);
	ASSERT3U(off, >, rl->r_off);
	ASSERT3U(off, <, rl->r_off + rl->r_len);
	ASSERT(rl->r_write_wanted == B_FALSE);
	ASSERT(rl->r_read_wanted == B_FALSE);

	/* create the rear proxy range lock */
	rear = kmem_alloc(sizeof (rl_t), KM_SLEEP);
	rear->r_off = off;
	rear->r_len = rl->r_off + rl->r_len - off;
	rear->r_cnt = rl->r_cnt;
	rear->r_type = RL_READER;
	rear->r_proxy = B_TRUE;
	rear->r_write_wanted = B_FALSE;
	rear->r_read_wanted = B_FALSE;

	front = zfs_range_proxify(tree, rl);
	front->r_len = off - rl->r_off;

	avl_insert_here(tree, rear, front, AVL_AFTER);
	return (front);
}
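
/*
 * Worked example (added for illustration; not in the original file):
 * splitting a proxy covering [0, 100) with r_cnt == 2 at off == 30 leaves
 * a front proxy [0, 30) and a rear proxy [30, 100), both with r_cnt == 2,
 * so every byte of the original range still records the same number of
 * reader holds.
 */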

/*
 * Create and add a new proxy range lock for the supplied range.
 */
static void
zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
{
	rl_t *rl;

	ASSERT(len);
	rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
	rl->r_off = off;
	rl->r_len = len;
	rl->r_cnt = 1;
	rl->r_type = RL_READER;
	rl->r_proxy = B_TRUE;
	rl->r_write_wanted = B_FALSE;
	rl->r_read_wanted = B_FALSE;
	avl_add(tree, rl);
}

static void
zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
{
	rl_t *next;
	uint64_t off = new->r_off;
	uint64_t len = new->r_len;

	/*
	 * prev arrives either:
	 * - pointing to an entry at the same offset
	 * - pointing to the entry with the closest previous offset whose
	 *   range may overlap with the new range
	 * - null, if there were no ranges starting before the new one
	 */
	if (prev) {
		if (prev->r_off + prev->r_len <= off) {
			prev = NULL;
		} else if (prev->r_off != off) {
			/*
			 * convert to proxy if needed then
			 * split this entry and bump ref count
			 */
			prev = zfs_range_split(tree, prev, off);
			prev = AVL_NEXT(tree, prev); /* move to rear range */
		}
	}
	ASSERT((prev == NULL) || (prev->r_off == off));

	if (prev)
		next = prev;
	else
		next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);

	if (next == NULL || off + len <= next->r_off) {
		/* no overlaps, use the original new rl_t in the tree */
		avl_insert(tree, new, where);
		return;
	}

	if (off < next->r_off) {
		/* Add a proxy for initial range before the overlap */
		zfs_range_new_proxy(tree, off, next->r_off - off);
	}

	new->r_cnt = 0; /* will use proxies in tree */
	/*
	 * We now search forward through the ranges, until we go past the end
	 * of the new range. For each entry we make it a proxy if it
	 * isn't already, then bump its reference count. If there are any
	 * gaps between the ranges then we create a new proxy range.
	 */
	for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
		if (off + len <= next->r_off)
			break;
		if (prev && prev->r_off + prev->r_len < next->r_off) {
			/* there's a gap */
			ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
			zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
			    next->r_off - (prev->r_off + prev->r_len));
		}
		if (off + len == next->r_off + next->r_len) {
			/* exact overlap with end */
			next = zfs_range_proxify(tree, next);
			next->r_cnt++;
			return;
		}
		if (off + len < next->r_off + next->r_len) {
			/* new range ends in the middle of this block */
			next = zfs_range_split(tree, next, off + len);
			next->r_cnt++;
			return;
		}
		ASSERT3U(off + len, >, next->r_off + next->r_len);
		next = zfs_range_proxify(tree, next);
		next->r_cnt++;
	}

	/* Add the remaining end range. */
	zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
	    (off + len) - (prev->r_off + prev->r_len));
}
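
/*
 * Worked example (added for illustration; not in the original file):
 * reader A holds [0, 100) as the only lock (a non-proxy entry, r_cnt == 1).
 * Reader B now locks [50, 150).  A's entry is proxified and split at 50,
 * giving proxies [0, 50) (r_cnt == 1) and [50, 100) (r_cnt == 2), and a
 * new proxy [100, 150) (r_cnt == 1) covers the tail.  Both callers keep
 * their original rl_t handles (r_cnt == 0) for use at unlock time.
 */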

/*
 * Check if a reader lock can be grabbed, or wait and recheck until available.
 */
static void
zfs_range_lock_reader(znode_t *zp, rl_t *new)
{
	avl_tree_t *tree = &zp->z_range_avl;
	rl_t *prev, *next;
	avl_index_t where;
	uint64_t off = new->r_off;
	uint64_t len = new->r_len;

	/*
	 * Look for any writer locks in the range.
	 */
retry:
	prev = avl_find(tree, new, &where);
	if (prev == NULL)
		prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);

	/*
	 * Check the previous range for a writer lock overlap.
	 */
	if (prev && (off < prev->r_off + prev->r_len)) {
		if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
			if (!prev->r_read_wanted) {
				cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
				prev->r_read_wanted = B_TRUE;
			}
			cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
			goto retry;
		}
		if (off + len < prev->r_off + prev->r_len)
			goto got_lock;
	}

	/*
	 * Search through the following ranges to see if there's
	 * a writer lock overlap.
	 */
	if (prev)
		next = AVL_NEXT(tree, prev);
	else
		next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
	for (; next; next = AVL_NEXT(tree, next)) {
		if (off + len <= next->r_off)
			goto got_lock;
		if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
			if (!next->r_read_wanted) {
				cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
				next->r_read_wanted = B_TRUE;
			}
			cv_wait(&next->r_rd_cv, &zp->z_range_lock);
			goto retry;
		}
		if (off + len <= next->r_off + next->r_len)
			goto got_lock;
	}

got_lock:
	/*
	 * Add the read lock, which may involve splitting existing
	 * locks and bumping ref counts (r_cnt).
	 */
	zfs_range_add_reader(tree, new, prev, where);
}

/*
 * Lock a range (offset, length) as either shared (RL_READER)
 * or exclusive (RL_WRITER). Returns the range lock structure
 * for later unlocking or range reduction (if the entire file
 * was previously locked as RL_WRITER).
 */
rl_t *
zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
{
	rl_t *new;

	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);

	new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
	new->r_zp = zp;
	new->r_off = off;
	new->r_len = len;
	new->r_cnt = 1; /* assume it's going to be in the tree */
	new->r_type = type;
	new->r_proxy = B_FALSE;
	new->r_write_wanted = B_FALSE;
	new->r_read_wanted = B_FALSE;

	mutex_enter(&zp->z_range_lock);
	if (type == RL_READER) {
		/*
		 * First check for the usual case of no locks
		 */
		if (avl_numnodes(&zp->z_range_avl) == 0)
			avl_add(&zp->z_range_avl, new);
		else
			zfs_range_lock_reader(zp, new);
	} else
		zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
	mutex_exit(&zp->z_range_lock);
	return (new);
}

/*
 * Unlock a reader lock
 */
static void
zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
{
	avl_tree_t *tree = &zp->z_range_avl;
	rl_t *rl, *next;
	uint64_t len;

	/*
	 * The common case is when the remove entry is in the tree
	 * (cnt == 1), meaning no other reader locks have overlapped
	 * with this one. Otherwise the remove entry will have been
	 * removed from the tree and replaced by proxies (one or
	 * more ranges mapping to the entire range).
	 */
	if (remove->r_cnt == 1) {
		avl_remove(tree, remove);
		if (remove->r_write_wanted) {
			cv_broadcast(&remove->r_wr_cv);
			cv_destroy(&remove->r_wr_cv);
		}
		if (remove->r_read_wanted) {
			cv_broadcast(&remove->r_rd_cv);
			cv_destroy(&remove->r_rd_cv);
		}
	} else {
		ASSERT3U(remove->r_cnt, ==, 0);
		ASSERT3U(remove->r_write_wanted, ==, 0);
		ASSERT3U(remove->r_read_wanted, ==, 0);
		/*
		 * Find start proxy representing this reader lock,
		 * then decrement ref count on all proxies
		 * that make up this range, freeing them as needed.
		 */
		rl = avl_find(tree, remove, NULL);
		ASSERT(rl);
		ASSERT(rl->r_cnt);
		ASSERT(rl->r_type == RL_READER);
		for (len = remove->r_len; len != 0; rl = next) {
			len -= rl->r_len;
			if (len) {
				next = AVL_NEXT(tree, rl);
				ASSERT(next);
				ASSERT(rl->r_off + rl->r_len == next->r_off);
				ASSERT(next->r_cnt);
				ASSERT(next->r_type == RL_READER);
			}
			rl->r_cnt--;
			if (rl->r_cnt == 0) {
				avl_remove(tree, rl);
				if (rl->r_write_wanted) {
					cv_broadcast(&rl->r_wr_cv);
					cv_destroy(&rl->r_wr_cv);
				}
				if (rl->r_read_wanted) {
					cv_broadcast(&rl->r_rd_cv);
					cv_destroy(&rl->r_rd_cv);
				}
				kmem_free(rl, sizeof (rl_t));
			}
		}
	}
	kmem_free(remove, sizeof (rl_t));
}

/*
 * Unlock range and destroy range lock structure.
 */
void
zfs_range_unlock(rl_t *rl)
{
	znode_t *zp = rl->r_zp;

	ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
	ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
	ASSERT(!rl->r_proxy);

	mutex_enter(&zp->z_range_lock);
	if (rl->r_type == RL_WRITER) {
		/* writer locks can't be shared or split */
		avl_remove(&zp->z_range_avl, rl);
		mutex_exit(&zp->z_range_lock);
		if (rl->r_write_wanted) {
			cv_broadcast(&rl->r_wr_cv);
			cv_destroy(&rl->r_wr_cv);
		}
		if (rl->r_read_wanted) {
			cv_broadcast(&rl->r_rd_cv);
			cv_destroy(&rl->r_rd_cv);
		}
		kmem_free(rl, sizeof (rl_t));
	} else {
		/*
		 * lock may be shared, let zfs_range_unlock_reader()
		 * release the lock and free the rl_t
		 */
		zfs_range_unlock_reader(zp, rl);
		mutex_exit(&zp->z_range_lock);
	}
}

/*
 * Reduce range locked as RL_WRITER from whole file to specified range.
 * Asserts the whole file is exclusively locked and so there's only one
 * entry in the tree.
 */
void
zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
{
	znode_t *zp = rl->r_zp;

	/* Ensure there are no other locks */
	ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
	ASSERT(rl->r_off == 0);
	ASSERT(rl->r_type == RL_WRITER);
	ASSERT(!rl->r_proxy);
	ASSERT3U(rl->r_len, ==, UINT64_MAX);
	ASSERT3U(rl->r_cnt, ==, 1);

	mutex_enter(&zp->z_range_lock);
	rl->r_off = off;
	rl->r_len = len;
	mutex_exit(&zp->z_range_lock);
	if (rl->r_write_wanted)
		cv_broadcast(&rl->r_wr_cv);
	if (rl->r_read_wanted)
		cv_broadcast(&rl->r_rd_cv);
}
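
/*
 * Illustrative sketch, not part of the original file: the grow-block flow
 * described at the top of this file.  A write that may need to grow the
 * blocksize comes back from zfs_range_lock() holding the whole file
 * (r_off == 0, r_len == UINT64_MAX); after the new blocksize is set, the
 * caller narrows the lock to just the bytes being written.  The guard
 * macro and function name are hypothetical.
 */
#ifdef	ZFS_RLOCK_EXAMPLE
static void
example_grow_block_write(znode_t *zp, uint64_t off, uint64_t len)
{
	rl_t *rl = zfs_range_lock(zp, off, len, RL_WRITER);

	if (rl->r_len == UINT64_MAX) {
		/* whole file locked: safe to grow the blocksize here */
		zfs_range_reduce(rl, off, len);
	}
	/* ... write [off, off + len) ... */
	zfs_range_unlock(rl);
}
#endif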

/*
 * AVL comparison function used to order range locks
 * Locks are ordered on the start offset of the range.
 */
int
zfs_range_compare(const void *arg1, const void *arg2)
{
	const rl_t *rl1 = arg1;
	const rl_t *rl2 = arg2;

	if (rl1->r_off > rl2->r_off)
		return (1);
	if (rl1->r_off < rl2->r_off)
		return (-1);
	return (0);
}
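
/*
 * Illustrative sketch, not part of the original file: how a consumer would
 * wire this comparator into the per-znode AVL tree.  Assumes rl_t embeds
 * its avl_node_t in a field named r_node (see zfs_rlock.h); the guard
 * macro and function name are hypothetical.
 */
#ifdef	ZFS_RLOCK_EXAMPLE
static void
example_init_range_tree(znode_t *zp)
{
	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zp->z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));
}
#endif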