/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/ipsec_impl.h>
#include <inet/ipclassifier.h>
#include <inet/ipp_common.h>

/*
 * This file implements TCP fusion - a protocol-less data path for TCP
 * loopback connections. The fusion of two local TCP endpoints occurs
 * at connection establishment time. Various conditions (see details
 * in tcp_fuse()) need to be met for fusion to be successful. If it
 * fails, we fall back to the regular TCP data path; if it succeeds,
 * both endpoints proceed to use tcp_fuse_output() as the transmit path.
 * tcp_fuse_output() enqueues application data directly onto the peer's
 * receive queue; no protocol processing is involved. After enqueueing
 * the data, the sender can either push (putnext) data up the receiver's
 * read queue; or the sender can simply return and let the receiver
 * retrieve the enqueued data via the synchronous streams entry point
 * tcp_fuse_rrw(). The latter path is taken if synchronous streams is
 * enabled (the default). It is disabled if sockfs no longer resides
 * directly on top of the tcp module due to a module insertion or
 * removal. It also needs to be temporarily disabled when sending
 * urgent data, because the tcp_fuse_rrw() path bypasses the M_PROTO
 * processing done by the strsock_proto() hook.
 *
 * Synchronization is handled by squeue and the mutex tcp_fuse_lock.
 * One of the requirements for fusion to succeed is that both endpoints
 * need to be using the same squeue. This ensures that neither side
 * can disappear while the other side is still sending data. By itself,
 * squeue is not sufficient for guaranteeing safety when synchronous
 * streams is enabled. The reason is that tcp_fuse_rrw() doesn't enter
 * the squeue, and its access to tcp_rcv_list and other fusion-related
 * fields needs to be synchronized with the sender. tcp_fuse_lock is
 * used for this purpose. When there is urgent data, the sender needs
 * to push the data up the receiver's streams read queue. In order to
 * avoid holding the tcp_fuse_lock across putnext(), the sender sets
 * the peer tcp's tcp_fuse_syncstr_stopped bit and releases tcp_fuse_lock
 * (see macro TCP_FUSE_SYNCSTR_STOP()). If tcp_fuse_rrw() enters after
 * this point, it will see that synchronous streams is temporarily
 * stopped and will immediately return EBUSY without accessing the
 * tcp_rcv_list or other fields protected by the tcp_fuse_lock. This
 * results in strget() calling getq_noenab() to dequeue data from
 * the stream head instead. After the sender has finished pushing up
 * all urgent data, it clears the tcp_fuse_syncstr_stopped bit using
 * TCP_FUSE_SYNCSTR_RESUME(), and the receiver may then resume using
 * tcp_fuse_rrw() to retrieve data from tcp_rcv_list.
 *
 * The following note applies only to the synchronous streams mode.
 *
 * Flow control is done by checking the size of receive buffer and
 * the number of data blocks, both set to different limits. This is
 * different from regular streams flow control, where the cumulative
 * size check dominates the block count check -- streams queue high
 * water mark typically represents bytes. Each enqueue triggers
 * notifications to the receiving process; a build-up of data blocks
 * indicates a slow receiver, and the sender should be blocked or
 * informed at the earliest moment instead of further wasting system
 * resources. In effect, this is equivalent to limiting the number
 * of outstanding segments in flight.
 */

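/*
 * In outline, the fused data path implemented below works like this
 * (a simplified sketch of the routines in this file, not a separate
 * mechanism):
 *
 *   sender write(2)
 *     -> tcp_fuse_output()   enqueue mblk onto peer's tcp_rcv_list,
 *                            then wake up / signal the peer
 *   receiver read(2)
 *     -> tcp_fuse_rrw()      dequeue tcp_rcv_list directly, or return
 *                            EBUSY so that strget() falls back to the
 *                            stream head via getq_noenab()
 */
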
/*
 * Macros that determine whether or not IP processing is needed for TCP.
 */
#define TCP_IPOPT_POLICY_V4(tcp)                                \
    ((tcp)->tcp_ipversion == IPV4_VERSION &&                    \
    ((tcp)->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH ||           \
    CONN_OUTBOUND_POLICY_PRESENT((tcp)->tcp_connp) ||           \
    CONN_INBOUND_POLICY_PRESENT((tcp)->tcp_connp)))

#define TCP_IPOPT_POLICY_V6(tcp)                                \
    ((tcp)->tcp_ipversion == IPV6_VERSION &&                    \
    ((tcp)->tcp_ip_hdr_len != IPV6_HDR_LEN ||                   \
    CONN_OUTBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp) ||        \
    CONN_INBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp)))

#define TCP_LOOPBACK_IP(tcp)                                    \
    (TCP_IPOPT_POLICY_V4(tcp) || TCP_IPOPT_POLICY_V6(tcp) ||    \
    !CONN_IS_MD_FASTPATH((tcp)->tcp_connp))

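/*
 * TCP_LOOPBACK_IP() is the gate used by tcp_fuse() and tcp_fuse_output()
 * below: if it evaluates to true for either endpoint, the connection
 * needs real IP processing and must take (or fall back to) the regular
 * TCP data path.
 */
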
/*
 * Setting this to false means we disable fusion altogether, and
 * loopback connections will go through the regular protocol paths.
 */
boolean_t do_tcp_fusion = B_TRUE;

/*
 * Enabling this flag allows sockfs to retrieve data directly
 * from a fused tcp endpoint using the synchronous streams interface.
 */
boolean_t do_tcp_direct_sockfs = B_TRUE;

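/*
 * Note that both tunables above are simple global variables; on a live
 * system they can be patched with a kernel debugger (for example,
 * "do_tcp_fusion/W 0" under mdb -kw). A change only affects endpoints
 * fused or enabled after that point.
 */
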
/*
 * This is the minimum number of outstanding writes allowed on
 * a synchronous streams-enabled receiving endpoint before the
 * sender gets flow-controlled. Setting this value to 0 means
 * that the data block limit is equivalent to the byte count
 * limit, which essentially disables the check.
 */
#define TCP_FUSION_RCV_UNREAD_MIN 8
uint_t tcp_fusion_rcv_unread_min = TCP_FUSION_RCV_UNREAD_MIN;

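/*
 * The actual per-connection limit is derived in tcp_fuse_maxpsz_set()
 * below as MAX(sndbuf >> 14, tcp_fusion_rcv_unread_min). For example,
 * a 48K send buffer yields MAX(49152 >> 14, 8) = 8 unread blocks,
 * while a 1M send buffer yields MAX(1048576 >> 14, 8) = 64.
 */
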
static void tcp_fuse_syncstr_enable(tcp_t *);
static void tcp_fuse_syncstr_disable(tcp_t *);
static void strrput_sig(queue_t *, boolean_t);

/*
 * This routine gets called by the eager tcp upon changing state from
 * SYN_RCVD to ESTABLISHED. It fuses a direct path between itself
 * and the active connect tcp such that the regular tcp processing
 * may be bypassed under allowable circumstances. Because the fusion
 * requires both endpoints to be in the same squeue, it does not work
 * for simultaneous active connects because there is no easy way to
 * switch from one squeue to another once the connection is created.
 * This is different from the eager tcp case where we assign it the
 * same squeue as the one given to the active connect tcp during open.
 */
void
tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
{
    conn_t *peer_connp, *connp = tcp->tcp_connp;
    tcp_t *peer_tcp;

    ASSERT(!tcp->tcp_fused);
    ASSERT(tcp->tcp_loopback);
    ASSERT(tcp->tcp_loopback_peer == NULL);
    /*
     * We need to inherit q_hiwat of the listener tcp, but we can't
     * really use tcp_listener since we get here after sending up
     * T_CONN_IND and tcp_wput_accept() may be called independently,
     * at which point tcp_listener is cleared; this is why we use
     * tcp_saved_listener. The listener itself is guaranteed to be
     * around until tcp_accept_finish() is called on this eager --
     * this won't happen until we're done since we're inside the
     * eager's perimeter now.
     */
    ASSERT(tcp->tcp_saved_listener != NULL);

    /*
     * Lookup peer endpoint; search for the remote endpoint having
     * the reversed address-port quadruplet in ESTABLISHED state,
     * which is guaranteed to be unique in the system. Zone check
     * is applied accordingly for loopback address, but not for
     * local address since we want fusion to happen across Zones.
     */
    if (tcp->tcp_ipversion == IPV4_VERSION) {
        peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp,
            (ipha_t *)iphdr, tcph);
    } else {
        peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp,
            (ip6_t *)iphdr, tcph);
    }

    /*
     * We can only proceed if the peer exists, resides in the same
     * squeue as our conn and is not a raw socket. The squeue
     * assignment of this eager tcp was done earlier at the time of
     * SYN processing in ip_fanout_tcp{_v6}. Note that a shared
     * squeue by itself doesn't guarantee a safe condition to fuse,
     * hence we perform additional tests below.
     */
    ASSERT(peer_connp == NULL || peer_connp != connp);
    if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp ||
        !IPCL_IS_TCP(peer_connp)) {
        if (peer_connp != NULL) {
            TCP_STAT(tcp_fusion_unqualified);
            CONN_DEC_REF(peer_connp);
        }
        return;
    }
    peer_tcp = peer_connp->conn_tcp;    /* active connect tcp */

    ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused);
    ASSERT(peer_tcp->tcp_loopback && peer_tcp->tcp_loopback_peer == NULL);
    ASSERT(peer_connp->conn_sqp == connp->conn_sqp);

    /*
     * Fuse the endpoints; we perform further checks against both
     * tcp endpoints to ensure that a fusion is allowed to happen.
     * In particular we bail out for non-simple TCP/IP or if IPsec/
     * IPQoS policy exists.
     */
    if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable &&
        !TCP_LOOPBACK_IP(tcp) && !TCP_LOOPBACK_IP(peer_tcp) &&
        !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
        mblk_t *mp;
        struct stroptions *stropt;
        queue_t *peer_rq = peer_tcp->tcp_rq;

        ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL);
        ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
        ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL);

        /*
         * We need to drain data on both endpoints during unfuse.
         * If we need to send up SIGURG at the time of draining,
         * we want to be sure that an mblk is readily available.
         * This is why we pre-allocate the M_PCSIG mblks for both
         * endpoints which will only be used during/after unfuse.
         */
        if ((mp = allocb(1, BPRI_HI)) == NULL)
            goto failed;

        tcp->tcp_fused_sigurg_mp = mp;

        if ((mp = allocb(1, BPRI_HI)) == NULL)
            goto failed;

        peer_tcp->tcp_fused_sigurg_mp = mp;

        /* Allocate M_SETOPTS mblk */
        if ((mp = allocb(sizeof (*stropt), BPRI_HI)) == NULL)
            goto failed;

        /* Fuse both endpoints */
        peer_tcp->tcp_loopback_peer = tcp;
        tcp->tcp_loopback_peer = peer_tcp;
        peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE;

        /*
         * We never use regular tcp paths in fusion and should
         * therefore clear tcp_unsent on both endpoints. Having
         * them set to non-zero values means asking for trouble,
         * especially after unfuse, where we may end up sending
         * through regular tcp paths which expect xmit_list and
         * friends to be correctly setup.
         */
        peer_tcp->tcp_unsent = tcp->tcp_unsent = 0;

        tcp_timers_stop(tcp);
        tcp_timers_stop(peer_tcp);

        /*
         * At this point we are a detached eager tcp and therefore
         * don't have a queue assigned to us until accept happens.
         * In the mean time the peer endpoint may immediately send
         * us data as soon as fusion is finished, and we need to be
         * able to flow control it in case it sends down a huge
         * amount of data while we're still detached. To prevent
         * that we inherit the listener's q_hiwat value; this is
         * temporary since we'll repeat the process in
         * tcp_accept_finish().
         */
        (void) tcp_fuse_set_rcv_hiwat(tcp,
            tcp->tcp_saved_listener->tcp_rq->q_hiwat);

        /*
         * Set the stream head's write offset value to zero since we
         * won't be needing any room for TCP/IP headers; tell it to
         * not break up the writes (this would reduce the amount of
         * work done by kmem); and configure our receive buffer.
         * Note that we can only do this for the active connect tcp
         * since our eager is still detached; it will be dealt with
         * later in tcp_accept_finish().
         */
        DB_TYPE(mp) = M_SETOPTS;
        mp->b_wptr += sizeof (*stropt);

        stropt = (struct stroptions *)mp->b_rptr;
        stropt->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT;
        stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE);
        stropt->so_wroff = 0;

        /*
         * Record the stream head's high water mark for the
         * peer endpoint; this is used for flow-control
         * purposes in tcp_fuse_output().
         */
        stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(peer_tcp,
            peer_rq->q_hiwat);

        /* Send the options up */
        putnext(peer_rq, mp);
    } else {
        TCP_STAT(tcp_fusion_unqualified);
    }
    CONN_DEC_REF(peer_connp);
    return;

failed:
    if (tcp->tcp_fused_sigurg_mp != NULL) {
        freeb(tcp->tcp_fused_sigurg_mp);
        tcp->tcp_fused_sigurg_mp = NULL;
    }
    if (peer_tcp->tcp_fused_sigurg_mp != NULL) {
        freeb(peer_tcp->tcp_fused_sigurg_mp);
        peer_tcp->tcp_fused_sigurg_mp = NULL;
    }
    CONN_DEC_REF(peer_connp);
}

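/*
 * Note that failing any of the qualification checks in tcp_fuse() is
 * not an error; the routine simply returns and the connection carries
 * on over the regular (unfused) TCP loopback data path.
 */
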
/*
 * Unfuse a previously-fused pair of tcp loopback endpoints.
 */
void
tcp_unfuse(tcp_t *tcp)
{
    tcp_t *peer_tcp = tcp->tcp_loopback_peer;

    ASSERT(tcp->tcp_fused && peer_tcp != NULL);
    ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp);
    ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
    ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0);
    ASSERT(tcp->tcp_fused_sigurg_mp != NULL);
    ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL);

    /*
     * We disable synchronous streams, drain any queued data and
     * clear tcp_direct_sockfs. The synchronous streams entry
     * points will become no-ops after this point.
     */
    tcp_fuse_disable_pair(tcp, B_TRUE);

    /*
     * Update th_seq and th_ack in the header template.
     */
    U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq);
    U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
    U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq);
    U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack);

    /* Unfuse the endpoints */
    peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE;
    peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL;
}

/*
 * Fusion output routine for urgent data. This routine is called by
 * tcp_fuse_output() for handling non-M_DATA mblks.
 */
void
tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp)
{
    mblk_t *mp1;
    struct T_exdata_ind *tei;
    tcp_t *peer_tcp = tcp->tcp_loopback_peer;
    mblk_t *head, *prev_head = NULL;

    ASSERT(tcp->tcp_fused);
    ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
    ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
    ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA);
    ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0);

    /*
     * Urgent data arrives in the form of T_EXDATA_REQ from above.
     * Each occurrence denotes a new urgent pointer. For each new
     * urgent pointer we signal (SIGURG) the receiving app to indicate
     * that it needs to go into urgent mode. This is similar to the
     * urgent data handling in the regular tcp. We don't need to keep
     * track of where the urgent pointer is, because each T_EXDATA_REQ
     * "advances" the urgent pointer for us.
     *
     * The actual urgent data carried by T_EXDATA_REQ is then prepended
     * by a T_EXDATA_IND before being enqueued behind any existing data
     * destined for the receiving app. There is only a single urgent
     * pointer (out-of-band mark) for a given tcp. If the new urgent
     * data arrives before the receiving app reads some existing urgent
     * data, the previous marker is lost. This behavior is emulated
     * accordingly below, by removing any existing T_EXDATA_IND messages
     * and essentially converting old urgent data into non-urgent.
     */
    ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID);
    /* Let sender get out of urgent mode */
    tcp->tcp_valid_bits &= ~TCP_URG_VALID;

    /*
     * This flag indicates that a signal needs to be sent up.
     * This flag will only get cleared once SIGURG is delivered and
     * is not affected by the tcp_fused flag -- delivery will still
     * happen even after an endpoint is unfused, to handle the case
     * where the sending endpoint immediately closes/unfuses after
     * sending urgent data and the accept is not yet finished.
     */
    peer_tcp->tcp_fused_sigurg = B_TRUE;

    /* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */
    DB_TYPE(mp) = M_PROTO;
    tei = (struct T_exdata_ind *)mp->b_rptr;
    tei->PRIM_type = T_EXDATA_IND;
    tei->MORE_flag = 0;
    mp->b_wptr = (uchar_t *)&tei[1];

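    /*
     * The in-place reuse above is safe because T_exdata_req and
     * T_exdata_ind share the same layout (PRIM_type and MORE_flag).
     */
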
    TCP_STAT(tcp_fusion_urg);
    BUMP_MIB(&tcp_mib, tcpOutUrg);

    head = peer_tcp->tcp_rcv_list;
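    /*
     * Walk the b_next-linked receive list. Each old T_EXDATA_IND
     * (with its urgent data chained via b_cont) is unlinked and
     * freed, and its M_DATA continuation takes its place in the
     * list, demoting the old urgent data to normal data.
     */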
    while (head != NULL) {
        /*
         * Remove existing T_EXDATA_IND, keep the data which follows
         * it and relink our list. Note that we don't modify the
         * tcp_rcv_last_tail since it never points to T_EXDATA_IND.
         */
        if (DB_TYPE(head) != M_DATA) {
            mp1 = head;

            ASSERT(DB_TYPE(mp1->b_cont) == M_DATA);
            head = mp1->b_cont;
            mp1->b_cont = NULL;
            head->b_next = mp1->b_next;
            mp1->b_next = NULL;
            if (prev_head != NULL)
                prev_head->b_next = head;
            if (peer_tcp->tcp_rcv_list == mp1)
                peer_tcp->tcp_rcv_list = head;
            if (peer_tcp->tcp_rcv_last_head == mp1)
                peer_tcp->tcp_rcv_last_head = head;
            freeb(mp1);
        }
        prev_head = head;
        head = head->b_next;
    }
}

/*
 * Fusion output routine, called by tcp_output() and tcp_wput_proto().
 */
boolean_t
tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
{
    tcp_t *peer_tcp = tcp->tcp_loopback_peer;
    queue_t *peer_rq;
    uint_t max_unread;
    boolean_t flow_stopped;
    boolean_t urgent = (DB_TYPE(mp) != M_DATA);

    ASSERT(tcp->tcp_fused);
    ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
    ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
    ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO ||
        DB_TYPE(mp) == M_PCPROTO);

    peer_rq = peer_tcp->tcp_rq;
    max_unread = peer_tcp->tcp_fuse_rcv_unread_hiwater;

    /* If this connection requires IP, unfuse and use regular path */
    if (TCP_LOOPBACK_IP(tcp) || TCP_LOOPBACK_IP(peer_tcp) ||
        IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
        TCP_STAT(tcp_fusion_aborted);
        tcp_unfuse(tcp);
        return (B_FALSE);
    }

    if (send_size == 0) {
        freemsg(mp);
        return (B_TRUE);
    }

    /*
     * Handle urgent data; we either send up SIGURG to the peer now
     * or do it later when we drain, in case the peer is detached
     * or if we're short of memory for the M_PCSIG mblk.
     */
    if (urgent) {
        /*
         * We stop synchronous streams when we have urgent data
         * queued to prevent tcp_fuse_rrw() from pulling it. If
         * for some reason the urgent data can't be delivered
         * below, synchronous streams will remain stopped until
         * someone drains the tcp_rcv_list.
         */
        TCP_FUSE_SYNCSTR_STOP(peer_tcp);
        tcp_fuse_output_urg(tcp, mp);
    }

    mutex_enter(&peer_tcp->tcp_fuse_lock);
    /*
     * Wake up and signal the peer; it is okay to do this before
     * enqueueing because we are holding the lock. One of the
     * advantages of synchronous streams is the ability for us to
     * find out when the application performs a read on the socket,
     * by way of the tcp_fuse_rrw() entry point being called. All
     * data that gets enqueued onto the receiver is treated as if
     * it has arrived at the receiving endpoint, thus generating
     * SIGPOLL/SIGIO for asynchronous sockets just as in the
     * strrput() case. However, we only wake up the application
     * when necessary, i.e. during the first enqueue. When
     * tcp_fuse_rrw() is called it will send everything upstream.
     */
    if (peer_tcp->tcp_direct_sockfs && !urgent &&
        !TCP_IS_DETACHED(peer_tcp)) {
        if (peer_tcp->tcp_rcv_list == NULL)
            STR_WAKEUP_SET(STREAM(peer_tcp->tcp_rq));
        /* Update poll events and send SIGPOLL/SIGIO if necessary */
        STR_SENDSIG(STREAM(peer_tcp->tcp_rq));
    }

    /*
     * Enqueue data into the peer's receive list; we may or may not
     * drain the contents depending on the conditions below.
     */
    tcp_rcv_enqueue(peer_tcp, mp, send_size);

    /* In case it wrapped around and also to keep it constant */
    peer_tcp->tcp_rwnd += send_size;

    /*
     * Exercise flow-control when needed; we will get back-enabled
     * in either tcp_accept_finish(), tcp_unfuse(), or tcp_fuse_rrw().
     * If tcp_direct_sockfs is on or if the peer endpoint is detached,
     * we emulate streams flow control by checking the peer's queue
     * size and high water mark; otherwise we simply use canputnext()
     * to decide if we need to stop our flow.
     *
     * The outstanding unread data block check does not apply for a
     * detached receiver; this is to avoid unnecessary blocking of the
     * sender while the accept is currently in progress and is quite
     * similar to the regular tcp.
     */
    if (TCP_IS_DETACHED(peer_tcp) || max_unread == 0)
        max_unread = UINT_MAX;

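    /*
     * Flow-control decision, restated: with direct sockfs (or a
     * detached peer) we stop when either the enqueued byte count
     * reaches the fused high water mark or the count of unread
     * data blocks reaches max_unread; otherwise we defer to
     * canputnext() on the peer's read queue.
     */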
    flow_stopped = tcp->tcp_flow_stopped;
    if (!flow_stopped &&
        (((peer_tcp->tcp_direct_sockfs || TCP_IS_DETACHED(peer_tcp)) &&
        (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_fuse_rcv_hiwater ||
        ++peer_tcp->tcp_fuse_rcv_unread_cnt >= max_unread)) ||
        (!peer_tcp->tcp_direct_sockfs &&
        !TCP_IS_DETACHED(peer_tcp) && !canputnext(peer_tcp->tcp_rq)))) {
        tcp_setqfull(tcp);
        flow_stopped = B_TRUE;
        TCP_STAT(tcp_fusion_flowctl);
        DTRACE_PROBE4(tcp__fuse__output__flowctl, tcp_t *, tcp,
            uint_t, send_size, uint_t, peer_tcp->tcp_rcv_cnt,
            uint_t, peer_tcp->tcp_fuse_rcv_unread_cnt);
    } else if (flow_stopped &&
        TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
        tcp_clrqfull(tcp);
    }

    loopback_packets++;
    tcp->tcp_last_sent_len = send_size;

    /* Need to adjust the following SNMP MIB-related variables */
    tcp->tcp_snxt += send_size;
    tcp->tcp_suna = tcp->tcp_snxt;
    peer_tcp->tcp_rnxt += send_size;
    peer_tcp->tcp_rack = peer_tcp->tcp_rnxt;

    BUMP_MIB(&tcp_mib, tcpOutDataSegs);
    UPDATE_MIB(&tcp_mib, tcpOutDataBytes, send_size);

    BUMP_MIB(&tcp_mib, tcpInSegs);
    BUMP_MIB(&tcp_mib, tcpInDataInorderSegs);
    UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, send_size);

    BUMP_LOCAL(tcp->tcp_obsegs);
    BUMP_LOCAL(peer_tcp->tcp_ibsegs);

    mutex_exit(&peer_tcp->tcp_fuse_lock);

    DTRACE_PROBE2(tcp__fuse__output, tcp_t *, tcp, uint_t, send_size);

    if (!TCP_IS_DETACHED(peer_tcp)) {
        /*
         * Drain the peer's receive queue if it has urgent data or
         * if we're not flow-controlled. There is no need for
         * draining normal data when tcp_direct_sockfs is on,
         * because the peer will pull the data via tcp_fuse_rrw().
         */
        if (urgent || (!flow_stopped && !peer_tcp->tcp_direct_sockfs)) {
            ASSERT(peer_tcp->tcp_rcv_list != NULL);
            (void) tcp_fuse_rcv_drain(peer_rq, peer_tcp, NULL);
            /*
             * If synchronous streams was stopped above due
             * to the presence of urgent data, re-enable it.
             */
            if (urgent)
                TCP_FUSE_SYNCSTR_RESUME(peer_tcp);
        }
    }
    return (B_TRUE);
}

/*
 * This routine gets called to deliver data upstream on a fused or
 * previously fused tcp loopback endpoint; the latter happens only
 * when there is a pending SIGURG signal plus urgent data that
 * couldn't be sent upstream earlier.
 */
boolean_t
tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
{
    mblk_t *mp;
#ifdef DEBUG
    uint_t cnt = 0;
#endif

    ASSERT(tcp->tcp_loopback);
    ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg);
    ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL);
    ASSERT(sigurg_mpp != NULL || tcp->tcp_fused);

    /* No need for the push timer now, in case it was scheduled */
    if (tcp->tcp_push_tid != 0) {
        (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
        tcp->tcp_push_tid = 0;
    }
    /*
     * If there's urgent data sitting in the receive list and we didn't
     * get a chance to send up a SIGURG signal, make sure we send
     * it first before draining in order to ensure that SIOCATMARK
     * works properly.
     */
    if (tcp->tcp_fused_sigurg) {
        /*
         * sigurg_mpp is normally NULL, i.e. when we're still
         * fused and didn't get here because of tcp_unfuse().
         * In this case try hard to allocate the M_PCSIG mblk.
         */
        if (sigurg_mpp == NULL &&
            (mp = allocb(1, BPRI_HI)) == NULL &&
            (mp = allocb_tryhard(1)) == NULL) {
            /* Alloc failed; try again next time */
            tcp->tcp_push_tid = TCP_TIMER(tcp, tcp_push_timer,
                MSEC_TO_TICK(tcp_push_timer_interval));
            return (B_TRUE);
        } else if (sigurg_mpp != NULL) {
            /*
             * Use the supplied M_PCSIG mblk; it means we're
             * either unfused or in the process of unfusing,
             * and the drain must happen now.
             */
            mp = *sigurg_mpp;
            *sigurg_mpp = NULL;
        }
        ASSERT(mp != NULL);

        tcp->tcp_fused_sigurg = B_FALSE;
        /* Send up the signal */
        DB_TYPE(mp) = M_PCSIG;
        *mp->b_wptr++ = (uchar_t)SIGURG;
        putnext(q, mp);
        /*
         * Let the regular tcp_rcv_drain() path handle
         * draining the data if we're no longer fused.
         */
        if (!tcp->tcp_fused)
            return (B_FALSE);
    }

    /*
     * In the synchronous streams case, we generate SIGPOLL/SIGIO for
     * each M_DATA that gets enqueued onto the receiver. At this point
     * we are about to drain any queued data via putnext(). In order
     * to avoid extraneous signal generation from strrput(), we set
     * the STRGETINPROG flag at the stream head prior to the draining
     * and restore it afterwards. This masks out signal generation
     * only for M_DATA messages and does not affect urgent data.
     */
    if (tcp->tcp_direct_sockfs)
        strrput_sig(q, B_FALSE);

    /* Drain the data */
    while ((mp = tcp->tcp_rcv_list) != NULL) {
        tcp->tcp_rcv_list = mp->b_next;
        mp->b_next = NULL;
#ifdef DEBUG
        cnt += msgdsize(mp);
#endif
        putnext(q, mp);
        TCP_STAT(tcp_fusion_putnext);
    }

    if (tcp->tcp_direct_sockfs)
        strrput_sig(q, B_TRUE);

    ASSERT(cnt == tcp->tcp_rcv_cnt);
    tcp->tcp_rcv_last_head = NULL;
    tcp->tcp_rcv_last_tail = NULL;
    tcp->tcp_rcv_cnt = 0;
    tcp->tcp_fuse_rcv_unread_cnt = 0;
    tcp->tcp_rwnd = q->q_hiwat;

    return (B_TRUE);
}

/*
 * Synchronous stream entry point for sockfs to retrieve
 * data directly from tcp_rcv_list.
 */
int
tcp_fuse_rrw(queue_t *q, struiod_t *dp)
{
    tcp_t *tcp = Q_TO_CONN(q)->conn_tcp;
    mblk_t *mp;

    mutex_enter(&tcp->tcp_fuse_lock);
    /*
     * If someone had turned off tcp_direct_sockfs or if synchronous
     * streams is temporarily disabled, we return EBUSY. This causes
     * strget() to dequeue data from the stream head instead.
     */
    if (!tcp->tcp_direct_sockfs || tcp->tcp_fuse_syncstr_stopped) {
        mutex_exit(&tcp->tcp_fuse_lock);
        TCP_STAT(tcp_fusion_rrw_busy);
        return (EBUSY);
    }

    if ((mp = tcp->tcp_rcv_list) != NULL) {
        tcp_t *peer_tcp = tcp->tcp_loopback_peer;

        DTRACE_PROBE3(tcp__fuse__rrw, tcp_t *, tcp,
            uint32_t, tcp->tcp_rcv_cnt, ssize_t, dp->d_uio.uio_resid);

        tcp->tcp_rcv_list = NULL;
        TCP_STAT(tcp_fusion_rrw_msgcnt);

        /*
         * At this point nothing should be left in tcp_rcv_list.
         * The only possible case where we would have a chain of
         * b_next-linked messages is urgent data, but we wouldn't
         * be here if that's true since urgent data is delivered
         * via putnext() and synchronous streams is stopped until
         * tcp_fuse_rcv_drain() is finished.
         */
        ASSERT(DB_TYPE(mp) == M_DATA && mp->b_next == NULL);

        tcp->tcp_rcv_last_head = NULL;
        tcp->tcp_rcv_last_tail = NULL;
        tcp->tcp_rcv_cnt = 0;
        tcp->tcp_fuse_rcv_unread_cnt = 0;

        if (peer_tcp->tcp_flow_stopped) {
            tcp_clrqfull(peer_tcp);
            TCP_STAT(tcp_fusion_backenabled);
        }
    }

    /*
     * Either we just dequeued everything or we get here from sockfs
     * and have nothing to return; in this case clear RSLEEP.
     */
    ASSERT(tcp->tcp_rcv_last_head == NULL);
    ASSERT(tcp->tcp_rcv_last_tail == NULL);
    ASSERT(tcp->tcp_rcv_cnt == 0);
    ASSERT(tcp->tcp_fuse_rcv_unread_cnt == 0);
    STR_WAKEUP_CLEAR(STREAM(q));

    mutex_exit(&tcp->tcp_fuse_lock);
    dp->d_mp = mp;
    return (0);
}

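/*
 * Note: sockfs reaches tcp_fuse_rrw() via the synchronous streams
 * qi_rwp entry point of tcp_loopback_rinit, which is installed on the
 * endpoint's read queue by tcp_fuse_syncstr_enable() below.
 */
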
/*
 * Synchronous stream entry point used by certain ioctls to retrieve
 * information about or peek into the tcp_rcv_list.
 */
int
tcp_fuse_rinfop(queue_t *q, infod_t *dp)
{
    tcp_t *tcp = Q_TO_CONN(q)->conn_tcp;
    mblk_t *mp;
    uint_t cmd = dp->d_cmd;
    int res = 0;
    int error = 0;
    struct stdata *stp = STREAM(q);

    mutex_enter(&tcp->tcp_fuse_lock);
    /* If shutdown on read has happened, return nothing */
    mutex_enter(&stp->sd_lock);
    if (stp->sd_flag & STREOF) {
        mutex_exit(&stp->sd_lock);
        goto done;
    }
    mutex_exit(&stp->sd_lock);

    /*
     * It is OK not to return an answer if tcp_rcv_list is
     * currently not accessible.
     */
    if (!tcp->tcp_direct_sockfs || tcp->tcp_fuse_syncstr_stopped ||
        (mp = tcp->tcp_rcv_list) == NULL)
        goto done;

    if (cmd & INFOD_COUNT) {
        /*
         * We have at least one message, and can only
         * return one at a time.
         */
        dp->d_count++;
        res |= INFOD_COUNT;
    }
    if (cmd & INFOD_BYTES) {
        /*
         * Return size of all data messages.
         */
        dp->d_bytes += tcp->tcp_rcv_cnt;
        res |= INFOD_BYTES;
    }
    if (cmd & INFOD_FIRSTBYTES) {
        /*
         * Return size of first data message.
         */
        dp->d_bytes = msgdsize(mp);
        res |= INFOD_FIRSTBYTES;
        dp->d_cmd &= ~INFOD_FIRSTBYTES;
    }
    if (cmd & INFOD_COPYOUT) {
        mblk_t *mp1;
        int n;

        if (DB_TYPE(mp) == M_DATA) {
            mp1 = mp;
        } else {
            mp1 = mp->b_cont;
            ASSERT(mp1 != NULL);
        }

        /*
         * Return data contents of first message.
         */
        ASSERT(DB_TYPE(mp1) == M_DATA);
        while (mp1 != NULL && dp->d_uiop->uio_resid > 0) {
            n = MIN(dp->d_uiop->uio_resid, MBLKL(mp1));
            if (n != 0 && (error = uiomove((char *)mp1->b_rptr, n,
                UIO_READ, dp->d_uiop)) != 0) {
                goto done;
            }
            mp1 = mp1->b_cont;
        }
        res |= INFOD_COPYOUT;
        dp->d_cmd &= ~INFOD_COPYOUT;
    }
done:
    mutex_exit(&tcp->tcp_fuse_lock);

    dp->d_res |= res;

    return (error);
}

/*
 * Enable synchronous streams on a fused tcp loopback endpoint.
 */
static void
tcp_fuse_syncstr_enable(tcp_t *tcp)
{
    queue_t *rq = tcp->tcp_rq;
    struct stdata *stp = STREAM(rq);

    /* We can only enable synchronous streams for sockfs mode */
    tcp->tcp_direct_sockfs = tcp->tcp_issocket && do_tcp_direct_sockfs;

    if (!tcp->tcp_direct_sockfs)
        return;

    mutex_enter(&stp->sd_lock);
    mutex_enter(QLOCK(rq));

    /*
     * We replace our q_qinfo with one that has the qi_rwp entry point.
     * Clear SR_SIGALLDATA because we generate the equivalent signal(s)
     * for every enqueued message in tcp_fuse_output().
     */
    rq->q_qinfo = &tcp_loopback_rinit;
    rq->q_struiot = tcp_loopback_rinit.qi_struiot;
    stp->sd_struiordq = rq;
    stp->sd_rput_opt &= ~SR_SIGALLDATA;

    mutex_exit(QLOCK(rq));
    mutex_exit(&stp->sd_lock);
}

/*
 * Disable synchronous streams on a fused tcp loopback endpoint.
 */
static void
tcp_fuse_syncstr_disable(tcp_t *tcp)
{
    queue_t *rq = tcp->tcp_rq;
    struct stdata *stp = STREAM(rq);

    if (!tcp->tcp_direct_sockfs)
        return;

    mutex_enter(&stp->sd_lock);
    mutex_enter(QLOCK(rq));

    /*
     * Reset q_qinfo to point to the default tcp entry points.
     * Also restore SR_SIGALLDATA so that strrput() can generate
     * the signals again for future M_DATA messages.
     */
    rq->q_qinfo = &tcp_rinit;
    rq->q_struiot = tcp_rinit.qi_struiot;
    stp->sd_struiordq = NULL;
    stp->sd_rput_opt |= SR_SIGALLDATA;
    tcp->tcp_direct_sockfs = B_FALSE;

    mutex_exit(QLOCK(rq));
    mutex_exit(&stp->sd_lock);
}

/*
 * Enable synchronous streams on a pair of fused tcp endpoints.
 */
void
tcp_fuse_syncstr_enable_pair(tcp_t *tcp)
{
    tcp_t *peer_tcp = tcp->tcp_loopback_peer;

    ASSERT(tcp->tcp_fused);
    ASSERT(peer_tcp != NULL);

    tcp_fuse_syncstr_enable(tcp);
    tcp_fuse_syncstr_enable(peer_tcp);
}

/*
 * Allow or disallow signals to be generated by strrput().
 */
static void
strrput_sig(queue_t *q, boolean_t on)
{
    struct stdata *stp = STREAM(q);

    mutex_enter(&stp->sd_lock);
    if (on)
        stp->sd_flag &= ~STRGETINPROG;
    else
        stp->sd_flag |= STRGETINPROG;
    mutex_exit(&stp->sd_lock);
}

/*
 * Disable synchronous streams on a pair of fused tcp endpoints and drain
 * any queued data; called either during unfuse or upon transitioning from
 * a socket to a stream endpoint due to _SIOCSOCKFALLBACK.
 */
void
tcp_fuse_disable_pair(tcp_t *tcp, boolean_t unfusing)
{
    tcp_t *peer_tcp = tcp->tcp_loopback_peer;

    ASSERT(tcp->tcp_fused);
    ASSERT(peer_tcp != NULL);

    /*
     * We need to prevent tcp_fuse_rrw() from entering before
     * we can disable synchronous streams.
     */
    TCP_FUSE_SYNCSTR_STOP(tcp);
    TCP_FUSE_SYNCSTR_STOP(peer_tcp);

    /*
     * Drain any pending data; the detached check is needed because
     * we may be called as a result of a tcp_unfuse() triggered by
     * tcp_fuse_output(). Note that in case of a detached tcp, the
     * draining will happen later after the tcp is unfused. For non-
     * urgent data, this can be handled by the regular tcp_rcv_drain().
     * If we have urgent data sitting in the receive list, we will
     * need to send up a SIGURG signal first before draining the data.
     * All of these will be handled by the code in tcp_fuse_rcv_drain()
     * when called from tcp_rcv_drain().
     */
    if (!TCP_IS_DETACHED(tcp)) {
        (void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp,
            (unfusing ? &tcp->tcp_fused_sigurg_mp : NULL));
    }
    if (!TCP_IS_DETACHED(peer_tcp)) {
        (void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp,
            (unfusing ? &peer_tcp->tcp_fused_sigurg_mp : NULL));
    }

    /* Lift up any flow-control conditions */
    if (tcp->tcp_flow_stopped) {
        tcp_clrqfull(tcp);
        TCP_STAT(tcp_fusion_backenabled);
    }
    if (peer_tcp->tcp_flow_stopped) {
        tcp_clrqfull(peer_tcp);
        TCP_STAT(tcp_fusion_backenabled);
    }

    /* Disable synchronous streams */
    tcp_fuse_syncstr_disable(tcp);
    tcp_fuse_syncstr_disable(peer_tcp);
}

/*
 * Calculate the size of the receive buffer for a fused tcp endpoint.
 */
size_t
tcp_fuse_set_rcv_hiwat(tcp_t *tcp, size_t rwnd)
{
    ASSERT(tcp->tcp_fused);

    /* Ensure that value is within the maximum upper bound */
    if (rwnd > tcp_max_buf)
        rwnd = tcp_max_buf;

    /* Obey the absolute minimum tcp receive high water mark */
    if (rwnd < tcp_sth_rcv_hiwat)
        rwnd = tcp_sth_rcv_hiwat;

    /*
     * Round up to system page size in case SO_RCVBUF is modified
     * after SO_SNDBUF; the latter is also similarly rounded up.
     */
    rwnd = P2ROUNDUP_TYPED(rwnd, PAGESIZE, size_t);
    tcp->tcp_fuse_rcv_hiwater = rwnd;
    return (rwnd);
}

/*
 * Calculate our maxpsz and the peer's limit on outstanding unread
 * data blocks for a fused tcp endpoint.
 */
int
tcp_fuse_maxpsz_set(tcp_t *tcp)
{
    tcp_t *peer_tcp = tcp->tcp_loopback_peer;
    uint_t sndbuf = tcp->tcp_xmit_hiwater;
    uint_t maxpsz = sndbuf;

    ASSERT(tcp->tcp_fused);
    ASSERT(peer_tcp != NULL);
    ASSERT(peer_tcp->tcp_fuse_rcv_hiwater != 0);
    /*
     * In the fused loopback case, we want the stream head to split
     * up larger writes into smaller chunks for a more accurate flow-
     * control accounting. Our maxpsz is half of the sender's send
     * buffer or the receiver's receive buffer, whichever is smaller.
     * We round up the buffer to system page size due to the lack of
     * a TCP MSS concept in Fusion.
     */
    if (maxpsz > peer_tcp->tcp_fuse_rcv_hiwater)
        maxpsz = peer_tcp->tcp_fuse_rcv_hiwater;
    maxpsz = P2ROUNDUP_TYPED(maxpsz, PAGESIZE, uint_t) >> 1;
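    /*
     * For example, with a 32K send buffer and a sufficiently large
     * peer receive buffer, maxpsz becomes P2ROUNDUP(32768, PAGESIZE)
     * >> 1 = 16K (assuming 4K pages), so the stream head splits
     * larger writes into 16K chunks.
     */
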
    /*
     * Calculate the peer's limit for the number of outstanding unread
     * data blocks. This is the number of data blocks that are allowed
     * to reside in the receiver's queue before the sender gets flow
     * controlled. It is used only in the synchronous streams mode as
     * a way to throttle the sender when it performs consecutive writes
     * faster than can be read. The value is derived from SO_SNDBUF in
     * order to give the sender some control; we divide it by a large
     * value (16KB) to produce a fairly low initial limit.
     */
    if (tcp_fusion_rcv_unread_min == 0) {
        /* A value of 0 means that we disable the check */
        peer_tcp->tcp_fuse_rcv_unread_hiwater = 0;
    } else {
        peer_tcp->tcp_fuse_rcv_unread_hiwater =
            MAX(sndbuf >> 14, tcp_fusion_rcv_unread_min);
    }
    return (maxpsz);
}