789
|
1 |
/*
|
|
2 |
* CDDL HEADER START
|
|
3 |
*
|
|
4 |
* The contents of this file are subject to the terms of the
|
1544
|
5 |
* Common Development and Distribution License (the "License").
|
|
6 |
* You may not use this file except in compliance with the License.
|
789
|
7 |
*
|
|
8 |
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
9 |
* or http://www.opensolaris.org/os/licensing.
|
|
10 |
* See the License for the specific language governing permissions
|
|
11 |
* and limitations under the License.
|
|
12 |
*
|
|
13 |
* When distributing Covered Code, include this CDDL HEADER in each
|
|
14 |
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
15 |
* If applicable, add the following below this CDDL HEADER, with the
|
|
16 |
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
17 |
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
18 |
*
|
|
19 |
* CDDL HEADER END
|
|
20 |
*/
|
|
21 |
/*
|
1544
|
22 |
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
|
789
|
23 |
* Use is subject to license terms.
|
|
24 |
*/
|
|
25 |
|
|
26 |
#pragma ident "%Z%%M% %I% %E% SMI"
|
|
27 |
|
|
28 |
#include <sys/zfs_context.h>
|
|
29 |
#include <sys/spa.h>
|
|
30 |
#include <sys/vdev_impl.h>
|
|
31 |
#include <sys/zio.h>
|
|
32 |
#include <sys/avl.h>
|
|
33 |
|
|
34 |
/*
|
|
35 |
* Virtual device vector for disk I/O scheduling.
|
|
36 |
*/
|
|
37 |
int
|
|
38 |
vdev_queue_deadline_compare(const void *x1, const void *x2)
|
|
39 |
{
|
|
40 |
const zio_t *z1 = x1;
|
|
41 |
const zio_t *z2 = x2;
|
|
42 |
|
|
43 |
if (z1->io_deadline < z2->io_deadline)
|
|
44 |
return (-1);
|
|
45 |
if (z1->io_deadline > z2->io_deadline)
|
|
46 |
return (1);
|
|
47 |
|
|
48 |
if (z1->io_offset < z2->io_offset)
|
|
49 |
return (-1);
|
|
50 |
if (z1->io_offset > z2->io_offset)
|
|
51 |
return (1);
|
|
52 |
|
|
53 |
if (z1 < z2)
|
|
54 |
return (-1);
|
|
55 |
if (z1 > z2)
|
|
56 |
return (1);
|
|
57 |
|
|
58 |
return (0);
|
|
59 |
}
|
|
60 |
|
|
61 |
int
|
|
62 |
vdev_queue_offset_compare(const void *x1, const void *x2)
|
|
63 |
{
|
|
64 |
const zio_t *z1 = x1;
|
|
65 |
const zio_t *z2 = x2;
|
|
66 |
|
|
67 |
if (z1->io_offset < z2->io_offset)
|
|
68 |
return (-1);
|
|
69 |
if (z1->io_offset > z2->io_offset)
|
|
70 |
return (1);
|
|
71 |
|
|
72 |
if (z1 < z2)
|
|
73 |
return (-1);
|
|
74 |
if (z1 > z2)
|
|
75 |
return (1);
|
|
76 |
|
|
77 |
return (0);
|
|
78 |
}
|
|
79 |
|
|
80 |
void
|
|
81 |
vdev_queue_init(vdev_t *vd)
|
|
82 |
{
|
|
83 |
vdev_queue_t *vq = &vd->vdev_queue;
|
|
84 |
|
|
85 |
mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
|
|
86 |
|
|
87 |
avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
|
|
88 |
sizeof (zio_t), offsetof(struct zio, io_deadline_node));
|
|
89 |
|
|
90 |
avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
|
|
91 |
sizeof (zio_t), offsetof(struct zio, io_offset_node));
|
|
92 |
|
|
93 |
avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
|
|
94 |
sizeof (zio_t), offsetof(struct zio, io_offset_node));
|
|
95 |
|
|
96 |
avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
|
|
97 |
sizeof (zio_t), offsetof(struct zio, io_offset_node));
|
|
98 |
}
|
|
99 |
|
|
100 |
void
|
|
101 |
vdev_queue_fini(vdev_t *vd)
|
|
102 |
{
|
|
103 |
vdev_queue_t *vq = &vd->vdev_queue;
|
|
104 |
|
1544
|
105 |
ASSERT(vq->vq_scrub_count == 0);
|
|
106 |
|
789
|
107 |
avl_destroy(&vq->vq_deadline_tree);
|
|
108 |
avl_destroy(&vq->vq_read_tree);
|
|
109 |
avl_destroy(&vq->vq_write_tree);
|
|
110 |
avl_destroy(&vq->vq_pending_tree);
|
|
111 |
|
|
112 |
mutex_destroy(&vq->vq_lock);
|
|
113 |
}
|
|
114 |
|
|
115 |
static void
|
1544
|
116 |
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
|
|
117 |
{
|
|
118 |
avl_add(&vq->vq_deadline_tree, zio);
|
|
119 |
avl_add(zio->io_vdev_tree, zio);
|
|
120 |
|
|
121 |
if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) &&
|
|
122 |
++vq->vq_scrub_count >= vq->vq_scrub_limit)
|
|
123 |
spa_scrub_throttle(zio->io_spa, 1);
|
|
124 |
}
|
|
125 |
|
|
126 |
static void
|
|
127 |
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
|
|
128 |
{
|
|
129 |
if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) &&
|
|
130 |
vq->vq_scrub_count-- >= vq->vq_scrub_limit)
|
|
131 |
spa_scrub_throttle(zio->io_spa, -1);
|
|
132 |
|
|
133 |
avl_remove(&vq->vq_deadline_tree, zio);
|
|
134 |
avl_remove(zio->io_vdev_tree, zio);
|
|
135 |
}
|
|
136 |
|
|
137 |
static void
|
789
|
138 |
vdev_queue_agg_io_done(zio_t *aio)
|
|
139 |
{
|
|
140 |
zio_t *dio;
|
|
141 |
uint64_t offset = 0;
|
|
142 |
|
|
143 |
while ((dio = aio->io_delegate_list) != NULL) {
|
|
144 |
if (aio->io_type == ZIO_TYPE_READ)
|
|
145 |
bcopy((char *)aio->io_data + offset, dio->io_data,
|
|
146 |
dio->io_size);
|
|
147 |
offset += dio->io_size;
|
|
148 |
aio->io_delegate_list = dio->io_delegate_next;
|
|
149 |
dio->io_delegate_next = NULL;
|
|
150 |
dio->io_error = aio->io_error;
|
|
151 |
zio_next_stage(dio);
|
|
152 |
}
|
|
153 |
ASSERT3U(offset, ==, aio->io_size);
|
|
154 |
|
|
155 |
zio_buf_free(aio->io_data, aio->io_size);
|
|
156 |
}
|
|
157 |
|
|
158 |
#define IS_ADJACENT(io, nio) \
|
|
159 |
((io)->io_offset + (io)->io_size == (nio)->io_offset)
|
|
160 |
|
|
161 |
typedef void zio_issue_func_t(zio_t *);
|
|
162 |
|
|
163 |
static zio_t *
|
|
164 |
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
|
|
165 |
zio_issue_func_t **funcp)
|
|
166 |
{
|
|
167 |
zio_t *fio, *lio, *aio, *dio;
|
|
168 |
avl_tree_t *tree;
|
|
169 |
uint64_t size;
|
|
170 |
|
|
171 |
ASSERT(MUTEX_HELD(&vq->vq_lock));
|
|
172 |
|
|
173 |
*funcp = NULL;
|
|
174 |
|
|
175 |
if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
|
|
176 |
avl_numnodes(&vq->vq_deadline_tree) == 0)
|
|
177 |
return (NULL);
|
|
178 |
|
|
179 |
fio = lio = avl_first(&vq->vq_deadline_tree);
|
|
180 |
|
|
181 |
tree = fio->io_vdev_tree;
|
|
182 |
size = fio->io_size;
|
|
183 |
|
|
184 |
while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
|
|
185 |
size + dio->io_size <= vq->vq_agg_limit) {
|
|
186 |
dio->io_delegate_next = fio;
|
|
187 |
fio = dio;
|
|
188 |
size += dio->io_size;
|
|
189 |
}
|
|
190 |
|
|
191 |
while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
|
|
192 |
size + dio->io_size <= vq->vq_agg_limit) {
|
|
193 |
lio->io_delegate_next = dio;
|
|
194 |
lio = dio;
|
|
195 |
size += dio->io_size;
|
|
196 |
}
|
|
197 |
|
|
198 |
if (fio != lio) {
|
|
199 |
char *buf = zio_buf_alloc(size);
|
|
200 |
uint64_t offset = 0;
|
|
201 |
int nagg = 0;
|
|
202 |
|
|
203 |
ASSERT(size <= vq->vq_agg_limit);
|
|
204 |
|
|
205 |
aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
|
|
206 |
fio->io_offset, buf, size, fio->io_type,
|
|
207 |
ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
|
1544
|
208 |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
|
|
209 |
ZIO_FLAG_NOBOOKMARK,
|
789
|
210 |
vdev_queue_agg_io_done, NULL);
|
|
211 |
|
|
212 |
aio->io_delegate_list = fio;
|
|
213 |
|
|
214 |
for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
|
|
215 |
ASSERT(dio->io_type == aio->io_type);
|
1544
|
216 |
ASSERT(dio->io_vdev_tree == tree);
|
789
|
217 |
if (dio->io_type == ZIO_TYPE_WRITE)
|
|
218 |
bcopy(dio->io_data, buf + offset, dio->io_size);
|
|
219 |
offset += dio->io_size;
|
1544
|
220 |
vdev_queue_io_remove(vq, dio);
|
789
|
221 |
zio_vdev_io_bypass(dio);
|
|
222 |
nagg++;
|
|
223 |
}
|
|
224 |
|
|
225 |
ASSERT(offset == size);
|
|
226 |
|
|
227 |
dprintf("%5s T=%llu off=%8llx agg=%3d "
|
|
228 |
"old=%5llx new=%5llx\n",
|
|
229 |
zio_type_name[fio->io_type],
|
|
230 |
fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);
|
|
231 |
|
|
232 |
avl_add(&vq->vq_pending_tree, aio);
|
|
233 |
|
|
234 |
*funcp = zio_nowait;
|
|
235 |
return (aio);
|
|
236 |
}
|
|
237 |
|
1544
|
238 |
ASSERT(fio->io_vdev_tree == tree);
|
|
239 |
vdev_queue_io_remove(vq, fio);
|
789
|
240 |
|
|
241 |
avl_add(&vq->vq_pending_tree, fio);
|
|
242 |
|
|
243 |
*funcp = zio_next_stage;
|
|
244 |
|
|
245 |
return (fio);
|
|
246 |
}
|
|
247 |
|
|
248 |
/*
 * Queue a read or write zio on its vdev and pick the next I/O to issue.
 *
 * A zio that already carries ZIO_FLAG_DONT_QUEUE is returned untouched.
 * Otherwise the zio is flagged DONT_CACHE | DONT_QUEUE, assigned to the read
 * or write tree according to its type, given a deadline derived from its
 * timestamp and priority, and added to the queue under vq_lock.
 *
 * Returns the zio chosen by vdev_queue_io_to_issue() when the caller is
 * expected to continue with it.  Returns NULL when nothing can be issued, or
 * when the choice was an aggregate, which is dispatched here via zio_nowait
 * after the lock is dropped.
 */
zio_t *
vdev_queue_io(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
	zio_issue_func_t *issue;
	zio_t *next;

	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
		return (zio);

	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;

	zio->io_vdev_tree = (zio->io_type == ZIO_TYPE_READ) ?
	    &vq->vq_read_tree : &vq->vq_write_tree;

	mutex_enter(&vq->vq_lock);

	zio->io_deadline = zio->io_priority +
	    (zio->io_timestamp >> vq->vq_time_shift);

	vdev_queue_io_add(vq, zio);

	next = vdev_queue_io_to_issue(vq, vq->vq_min_pending, &issue);

	mutex_exit(&vq->vq_lock);

	/* Only an aggregate (issued via zio_nowait) is dispatched from here. */
	if (next != NULL && issue == zio_nowait) {
		issue(next);
		return (NULL);
	}

	return (next);
}
|
|
284 |
|
|
285 |
void
|
|
286 |
vdev_queue_io_done(zio_t *zio)
|
|
287 |
{
|
|
288 |
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
|
|
289 |
zio_t *nio;
|
|
290 |
zio_issue_func_t *func;
|
|
291 |
int i;
|
|
292 |
|
|
293 |
mutex_enter(&vq->vq_lock);
|
|
294 |
|
|
295 |
avl_remove(&vq->vq_pending_tree, zio);
|
|
296 |
|
|
297 |
for (i = 0; i < vq->vq_ramp_rate; i++) {
|
|
298 |
nio = vdev_queue_io_to_issue(vq, vq->vq_max_pending, &func);
|
|
299 |
if (nio == NULL)
|
|
300 |
break;
|
|
301 |
mutex_exit(&vq->vq_lock);
|
|
302 |
if (func == zio_next_stage)
|
|
303 |
zio_vdev_io_reissue(nio);
|
|
304 |
func(nio);
|
|
305 |
mutex_enter(&vq->vq_lock);
|
|
306 |
}
|
|
307 |
|
|
308 |
mutex_exit(&vq->vq_lock);
|
|
309 |
}
|