author | Mark J. Nelson <Mark.J.Nelson@Sun.COM> |
Wed, 06 Aug 2008 16:29:39 -0600 | |
changeset 7298 | b69e27387f74 |
parent 6812 | febeba71273d |
permissions | -rw-r--r-- |
0 | 1 |
/* |
2 |
* CDDL HEADER START |
|
3 |
* |
|
4 |
* The contents of this file are subject to the terms of the |
|
6812 | 5 |
* Common Development and Distribution License (the "License"). |
6 |
* You may not use this file except in compliance with the License. |
|
0 | 7 |
* |
8 |
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
|
9 |
* or http://www.opensolaris.org/os/licensing. |
|
10 |
* See the License for the specific language governing permissions |
|
11 |
* and limitations under the License. |
|
12 |
* |
|
13 |
* When distributing Covered Code, include this CDDL HEADER in each |
|
14 |
* file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
|
15 |
* If applicable, add the following below this CDDL HEADER, with the |
|
16 |
* fields enclosed by brackets "[]" replaced with your own identifying |
|
17 |
* information: Portions Copyright [yyyy] [name of copyright owner] |
|
18 |
* |
|
19 |
* CDDL HEADER END |
|
20 |
*/ |
|
6812 | 21 |
|
0 | 22 |
/* |
6812 | 23 |
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. |
24 |
* Use is subject to license terms. |
|
0 | 25 |
*/ |
26 |
||
7298
b69e27387f74
6733918 Teamware has retired, please welcome your new manager, Mercurial
Mark J. Nelson <Mark.J.Nelson@Sun.COM>
parents:
6812
diff
changeset
|
27 |
.file "__align_cpy_4.s" |
0 | 28 |
|
29 |
/* __align_cpy_4(s1, s2, n)
 *
 * Copy 4-byte aligned source to 4-byte aligned target in multiples of 4 bytes.
 *
 * Input:
 *	o0	address of target
 *	o1	address of source
 *	o2	number of bytes to copy (must be a multiple of 4)
 * Output:
 *	o0	address of target
 * Caller's registers that have been changed by this function:
 *	o1-o5, g1, g5
 *
 * Note:
 *	This helper routine will not be used by any 32-bit compilations.
 *	To do so would break binary compatibility with previous versions of
 *	Solaris.
 *
 * Assumptions:
 *	Source and target addresses are 4-byte aligned.
 *	Bytes to be copied are non-overlapping or _exactly_ overlapping.
 *	The number of bytes to be copied is a multiple of 4.
 *	Call will usually be made with a byte count of more than 4*4 and
 *	less than a few hundred bytes.  Legal values are 0 to MAX_SIZE_T.
 *
 * Optimization attempt:
 *	Reasonable speed for a generic v9.
 */
|
57 |
||
58 |
#include <sys/asm_linkage.h> |
|
59 |
||
60 |
ENTRY(__align_cpy_4)
	!
	! Register use inside this routine:
	!   %o0      target address; never written here, so it is still the
	!            return value at every retl
	!   %g1      working target pointer (copy of %o0)
	!   %o1      working source pointer
	!   %o2      bytes remaining on the .both8 path; reused as a data
	!            scratch register on the .noton8 path
	!   %o3-%o5  scratch (alignment offsets, then data being copied)
	!   %g5      end address of source (.noton8 path only)
	!
	! SPARC branches have a delay slot: the instruction after a branch is
	! executed before control transfers.  For a conditional branch with
	! the annul bit (",a") the delay instruction is executed only when
	! the branch is taken.
	!
	brz,pn	%o2, .done		! Skip out if no bytes to copy.
	cmp	%o0, %o1		! (delay slot) target == source?
	be,pn	%xcc, .done		! Addresses are identical--done.
	and	%o0, 7, %o3		! (delay slot) Is target 8-byte aligned?
	and	%o1, 7, %o4		! Is source 8-byte aligned?
	cmp	%o3, %o4
	bne,pt	%icc, .noton8		! Exactly one of source and target is
	mov	%o0, %g1		! 8-byte aligned.  (The delay slot is
					! not annulled, so %g1 is set on both
					! the taken and the fall-through path.)
	brz,pt	%o3, .both8		! Both are 8-byte aligned.
	nop

	! Offsets within 8 bytes are equal and nonzero.
	ld	[%o1], %o3		! Neither is aligned, so do 4 bytes;
	subcc	%o2, 4, %o2		! then both will be aligned.
	st	%o3, [%g1]
	bz,pn	%xcc, .done		! That word was the entire copy.
	add	%g1, 4, %g1		! (delay slot)
	b	.both8
	add	%o1, 4, %o1		! (delay slot)

! Section of code dealing with case where source and target are both 8-byte
! aligned.  Get and store 16 bytes at a time using ldx and stx.

	.align 32
.both8:					! Both source and target are aligned.
	cmp	%o2, 16
	bl,a,pn	%xcc, .chkwd		! Fewer than 16 bytes: skip the loop.
	cmp	%o2, 8			! (annulled delay slot) Executed only
					! when branching; sets the condition
					! codes that .chkwd tests first.

	sub	%o2, 12, %o2		! Bias the count by -12: after each
					! 16-byte step a result greater than
					! zero means at least 16 more bytes
					! remain (count is a multiple of 4).
.loop16a:				! Load and store 16 bytes at a time.
	ldx	[%o1], %o3
	ldx	[%o1+8], %o4
	subcc	%o2, 16, %o2
	stx	%o3, [%g1]
	stx	%o4, [%g1+8]
	add	%o1, 16, %o1
	bg,pt	%xcc, .loop16a		! Have at least 16 bytes left.
	add	%g1, 16, %g1		! (delay slot)

	addcc	%o2, 12, %o2		! Remove the bias: %o2 = bytes left.
	bg,a,pt	%xcc, .chkwd		! Have some remaining bytes.
	cmp	%o2, 8			! (annulled delay slot) Flags for .chkwd.
	retl
	nop

.chkwd:					! Entered with flags from cmp %o2, 8.
	bl,a,pn	%xcc, .wrword		! Only 4 bytes left.
	ld	[%o1], %o3		! (annulled delay slot) Load that word.

	ldx	[%o1], %o3		! Have 8 or 12, so do 8.
	stx	%o3, [%g1]
	add	%o1, 8, %o1
	add	%g1, 8, %g1
	subcc	%o2, 8, %o2
	bg,a,pn	%xcc, .wrword		! Still have four to do.
	ld	[%o1], %o3		! (annulled delay slot) Load that word.

	retl
	nop

.wrword:				! Copy final word (already in %o3).
	st	%o3, [%g1]
					! Fall through to .done.
.done:
	retl
	nop

! Section of code where either source or target, but not both, are 8-byte
! aligned.  So, use ld and st instructions rather than trying to copy stuff
! around in registers.
!
! The loop on this path terminates by comparing the source pointer with the
! source end address in %g5, so %o2 is free to be reused for data.

	.align 32			! Ultra cache line boundary.
.noton8:
	add	%o1, %o2, %g5		! Ending address of source.
	andcc	%o2, 15, %o3		! Mod 16 of number of bytes to copy.
	bz,pn	%xcc, .loop16		! Copy odd amounts first, then multiples of 16.
	cmp	%o3, 4			! Each cmp below sits in the delay slot
	bz,pn	%xcc, .mod4		! of the branch before it and sets the
	cmp	%o3, 8			! flags tested by the branch after it.
	bz,pn	%xcc, .mod8
	cmp	%o3, 12
	bz,pt	%xcc, .mod12
	nop
	illtrap	0			! Size not valid (not a multiple of 4).

.mod4:					! Do first 4 bytes, then do multiples of 16.
	lduw	[%o1], %o2
	add	%o1, 4, %o1
	st	%o2, [%g1]
	cmp	%o1, %g5		! More source left?
	bl,a,pt	%xcc, .loop16
	add	%g1, 4, %g1		! (annulled delay slot) Advance target
					! only when continuing.
	retl
	nop
.mod8:					! Do first 8 bytes, then do multiples of 16.
	lduw	[%o1], %o2
	lduw	[%o1+4], %o3
	add	%o1, 8, %o1
	st	%o2, [%g1]
	st	%o3, [%g1+4]
	cmp	%o1, %g5		! More source left?
	bl,a,pt	%xcc, .loop16
	add	%g1, 8, %g1		! (annulled delay slot)
	retl
	nop
.mod12:					! Do first 12 bytes, then do multiples of 16.
	lduw	[%o1], %o2
	lduw	[%o1+4], %o3
	lduw	[%o1+8], %o4
	add	%o1, 12, %o1
	st	%o2, [%g1]
	st	%o3, [%g1+4]
	st	%o4, [%g1+8]
	cmp	%o1, %g5		! More source left?
	bl,a,pt	%xcc, .loop16
	add	%g1, 12, %g1		! (annulled delay slot)
	retl
	nop
	.align 32			! Ultra cache line boundary.
.loop16:				! Do multiples of 16 bytes.
	lduw	[%o1], %o2
	lduw	[%o1+4], %o3
	lduw	[%o1+8], %o4
	lduw	[%o1+12], %o5
	add	%o1, 16, %o1
	st	%o2, [%g1]
	st	%o3, [%g1+4]
	cmp	%o1, %g5		! Reached the end of the source?
	st	%o4, [%g1+8]
	st	%o5, [%g1+12]
	bl,a,pt	%xcc, .loop16
	add	%g1, 16,%g1		! (annulled delay slot)
	retl				! Target address is already in o0.
	nop

SET_SIZE(__align_cpy_4)