|
#!/usr/bin/env perl

# ====================================================================
# Written by David S. Miller <[email protected]> and Andy Polyakov
# <[email protected]>. The module is licensed under 2-clause BSD
# license. October 2012. All rights reserved.
# ====================================================================

######################################################################
# AES for SPARC T4.
#
# AES round instructions complete in 3 cycles and can be issued every
# cycle. It means that round calculations should take 4*rounds cycles,
# because any given round instruction depends on result of *both*
# previous instructions:
#
#	|0 |1 |2 |3 |4
#	|01|01|01|
#	   |23|23|23|
#	            |01|01|...
#	               |23|...
#
# Provided that fxor [with IV] takes 3 cycles to complete, critical
# path length for CBC encrypt would be 3+4*rounds, or in other words
# it should process one byte in at least (3+4*rounds)/16 cycles. This
# estimate doesn't account for "collateral" instructions, such as
# fetching input from memory, xor-ing it with zero-round key and
# storing the result. Yet, *measured* performance [for data aligned
# at 64-bit boundary!] deviates from this equation by less than 0.5%:
#
#		128-bit key	192-		256-
# CBC encrypt	2.70/2.90(*)	3.20/3.40	3.70/3.90
#			 (*) numbers after slash are for
#			     misaligned data.
#
# Out-of-order execution logic managed to fully overlap "collateral"
# instructions with those on critical path. Amazing!
#
# As with Intel AES-NI, question is if it's possible to improve
# performance of parallelizable modes by interleaving round
# instructions. Provided round instruction latency and throughput
# optimal interleave factor is 2. But can we expect 2x performance
# improvement? Well, as round instructions can be issued one per
# cycle, they don't saturate the 2-way issue pipeline and therefore
# there is room for "collateral" calculations... Yet, 2x speed-up
# over CBC encrypt remains unattainable:
#
#		128-bit key	192-		256-
# CBC decrypt	1.64/2.11	1.89/2.37	2.23/2.61
# CTR		1.64/2.08(*)	1.89/2.33	2.23/2.61
#			 (*) numbers after slash are for
#			     misaligned data.
#
# Estimates based on amount of instructions under assumption that
# round instructions are not pairable with any other instruction
# suggest that latter is the actual case and pipeline runs
# underutilized. It should be noted that T4 out-of-order execution
# logic is so capable that performance gain from 2x interleave is
# not even impressive, ~7-13% over non-interleaved code, largest
# for 256-bit keys.

# To anchor to something else, software implementation processes
# one byte in 29 cycles with 128-bit key on same processor. Intel
# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
# in 0.93, naturally with AES-NI.
|
# Locate the directory this script lives in so that sparcv9_modes.pl
# (the shared SPARC perlasm mode generators) can be pulled in via @INC.
# Guard the capture: if $0 carries no directory component the original
# unconditional "$dir=$1" would pick up an undef (or stale) $1, so fall
# back to "" explicitly — which interpolates exactly the way an undef
# $dir used to, keeping the old @INC entries.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

# Let the perlasm back-end parse flavour/output arguments.
&asm_init(@ARGV);

$::evp=1;	# if $evp is set to 0, script generates module with
# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
# points. These however are not fully compatible with openssl/aes.h,
# because they expect AES_KEY to be aligned at 64-bit boundary. When
# used through EVP, alignment is arranged at EVP layer. Second thing
# that is arranged by EVP is at least 32-bit alignment of IV.

|
######################################################################
# single-round subroutines
#
{
# Register map for the leaf routines aes_t4_{en,de}crypt(inp, out, key):
# the three C arguments arrive in %o0..%o2 and everything else stays in
# %o-registers, so no register window is needed.
my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));

# 64-bit ABI builds must declare %g2/%g3 as scratch, or the assembler
# rejects their use.
$code.=<<___ if ($::abibits==64);
.register	%g2,#scratch
.register	%g3,#scratch

___

# aes_t4_encrypt / aes_t4_decrypt: one 16-byte block per call.
# Misaligned input is read 8 bytes at a time and re-assembled with
# shift/or pairs; misaligned output goes out through faligndata plus
# partial stores ("stda ... 0xc0").  The round count is loaded from
# key[240], halved, and the main loop runs two rounds per iteration
# (the final two rounds use the *_l "last round" forms outside the
# loop).
$code.=<<___;
#include <openssl/fipssyms.h>

.text

.globl	aes_t4_encrypt
.align	32
aes_t4_encrypt:
	andcc		$inp, 7, %g1		! is input aligned?
	andn		$inp, 7, $inp

	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5

	ldx		[$inp + 0], %o4
	bz,pt		%icc, 1f
	ldx		[$inp + 8], %o5
	ldx		[$inp + 16], $inp
	sll		%g1, 3, %g1
	sub		%g0, %g1, %o3
	sllx		%o4, %g1, %o4
	sllx		%o5, %g1, %g1
	srlx		%o5, %o3, %o5
	srlx		$inp, %o3, %o3
	or		%o5, %o4, %o4
	or		%o3, %g1, %o5
1:
	ld		[$key + 240], $rounds
	ldd		[$key + 16], %f12
	ldd		[$key + 24], %f14
	xor		%g4, %o4, %o4
	xor		%g5, %o5, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	srl		$rounds, 1, $rounds
	ldd		[$key + 32], %f16
	sub		$rounds, 1, $rounds
	ldd		[$key + 40], %f18
	add		$key, 48, $key

.Lenc:
	aes_eround01	%f12, %f0, %f2, %f4
	aes_eround23	%f14, %f0, %f2, %f2
	ldd		[$key + 0], %f12
	ldd		[$key + 8], %f14
	sub		$rounds,1,$rounds
	aes_eround01	%f16, %f4, %f2, %f0
	aes_eround23	%f18, %f4, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	brnz,pt		$rounds, .Lenc
	add		$key, 32, $key

	andcc		$out, 7, $tmp		! is output aligned?
	aes_eround01	%f12, %f0, %f2, %f4
	aes_eround23	%f14, %f0, %f2, %f2
	aes_eround01_l	%f16, %f4, %f2, %f0
	aes_eround23_l	%f18, %f4, %f2, %f2

	bnz,pn		%icc, 2f
	nop

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

2:	alignaddrl	$out, %g0, $out
	mov		0xff, $mask
	srl		$mask, $tmp, $mask

	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $mask, $mask
	retl
	stda		%f8, [$out + $mask]0xc0	! partial store
.type	aes_t4_encrypt,#function
.size	aes_t4_encrypt,.-aes_t4_encrypt

.globl	aes_t4_decrypt
.align	32
aes_t4_decrypt:
	andcc		$inp, 7, %g1		! is input aligned?
	andn		$inp, 7, $inp

	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5

	ldx		[$inp + 0], %o4
	bz,pt		%icc, 1f
	ldx		[$inp + 8], %o5
	ldx		[$inp + 16], $inp
	sll		%g1, 3, %g1
	sub		%g0, %g1, %o3
	sllx		%o4, %g1, %o4
	sllx		%o5, %g1, %g1
	srlx		%o5, %o3, %o5
	srlx		$inp, %o3, %o3
	or		%o5, %o4, %o4
	or		%o3, %g1, %o5
1:
	ld		[$key + 240], $rounds
	ldd		[$key + 16], %f12
	ldd		[$key + 24], %f14
	xor		%g4, %o4, %o4
	xor		%g5, %o5, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	srl		$rounds, 1, $rounds
	ldd		[$key + 32], %f16
	sub		$rounds, 1, $rounds
	ldd		[$key + 40], %f18
	add		$key, 48, $key

.Ldec:
	aes_dround01	%f12, %f0, %f2, %f4
	aes_dround23	%f14, %f0, %f2, %f2
	ldd		[$key + 0], %f12
	ldd		[$key + 8], %f14
	sub		$rounds,1,$rounds
	aes_dround01	%f16, %f4, %f2, %f0
	aes_dround23	%f18, %f4, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	brnz,pt		$rounds, .Ldec
	add		$key, 32, $key

	andcc		$out, 7, $tmp		! is output aligned?
	aes_dround01	%f12, %f0, %f2, %f4
	aes_dround23	%f14, %f0, %f2, %f2
	aes_dround01_l	%f16, %f4, %f2, %f0
	aes_dround23_l	%f18, %f4, %f2, %f2

	bnz,pn		%icc, 2f
	nop

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

2:	alignaddrl	$out, %g0, $out
	mov		0xff, $mask
	srl		$mask, $tmp, $mask

	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $mask, $mask
	retl
	stda		%f8, [$out + $mask]0xc0	! partial store
.type	aes_t4_decrypt,#function
.size	aes_t4_decrypt,.-aes_t4_decrypt
___
}
|
253 |
|
######################################################################
# key setup subroutines
#
{
# aes_t4_set_{en,de}crypt_key(userKey, bits, key): arguments in
# %o0..%o2; $tmp doubles as alignment scratch, as the stored round
# count, and as the flip-loop counter.
my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));

# Key expansion.  The user key is loaded via alignaddr/faligndata so
# arbitrary alignment is tolerated; the schedule is produced with the
# T4 aes_kexpand{0,1,2} instructions.  The routine dispatches on $bits
# (<192 -> .L128, ==192 -> .L192, else 256-bit fall-through) and ends
# by storing the round count (14/12/10) at key[240], matching the
# AES_KEY layout.
$code.=<<___;
.globl	aes_t4_set_encrypt_key
.align	32
aes_t4_set_encrypt_key:
.Lset_encrypt_key:
	and		$inp, 7, $tmp
	alignaddr	$inp, %g0, $inp
	cmp		$bits, 192
	ldd		[$inp + 0], %f0
	bl,pt		%icc,.L128
	ldd		[$inp + 8], %f2

	be,pt		%icc,.L192
	ldd		[$inp + 16], %f4
	brz,pt		$tmp, .L256aligned
	ldd		[$inp + 24], %f6

	ldd		[$inp + 32], %f8
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f8, %f6
.L256aligned:
___
# 256-bit: 6 full expansion steps of 32 bytes each...
for ($i=0; $i<6; $i++) {
    $code.=<<___;
	std		%f0, [$out + `32*$i+0`]
	aes_kexpand1	%f0, %f6, $i, %f0
	std		%f2, [$out + `32*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `32*$i+16`]
	aes_kexpand0	%f4, %f2, %f4
	std		%f6, [$out + `32*$i+24`]
	aes_kexpand2	%f6, %f4, %f6
___
}
# ...plus a partial final step (only %f0/%f2 expand once more), then
# the tail is stored flat; also the .L192 alignment fix-up follows.
$code.=<<___;
	std		%f0, [$out + `32*$i+0`]
	aes_kexpand1	%f0, %f6, $i, %f0
	std		%f2, [$out + `32*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `32*$i+16`]
	std		%f6, [$out + `32*$i+24`]
	std		%f0, [$out + `32*$i+32`]
	std		%f2, [$out + `32*$i+40`]

	mov		14, $tmp
	st		$tmp, [$out + 240]
	retl
	xor		%o0, %o0, %o0

.align	16
.L192:
	brz,pt		$tmp, .L192aligned
	nop

	ldd		[$inp + 24], %f6
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
.L192aligned:
___
# 192-bit: 7 expansion steps of 24 bytes each.
for ($i=0; $i<7; $i++) {
    $code.=<<___;
	std		%f0, [$out + `24*$i+0`]
	aes_kexpand1	%f0, %f4, $i, %f0
	std		%f2, [$out + `24*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `24*$i+16`]
	aes_kexpand2	%f4, %f2, %f4
___
}
# 192-bit tail (round count 12), then the .L128 alignment fix-up.
$code.=<<___;
	std		%f0, [$out + `24*$i+0`]
	aes_kexpand1	%f0, %f4, $i, %f0
	std		%f2, [$out + `24*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `24*$i+16`]
	std		%f0, [$out + `24*$i+24`]
	std		%f2, [$out + `24*$i+32`]

	mov		12, $tmp
	st		$tmp, [$out + 240]
	retl
	xor		%o0, %o0, %o0

.align	16
.L128:
	brz,pt		$tmp, .L128aligned
	nop

	ldd		[$inp + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
.L128aligned:
___
# 128-bit: 10 expansion steps of 16 bytes each.
for ($i=0; $i<10; $i++) {
    $code.=<<___;
	std		%f0, [$out + `16*$i+0`]
	aes_kexpand1	%f0, %f2, $i, %f0
	std		%f2, [$out + `16*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
___
}
# 128-bit tail (round count 10), then aes_t4_set_decrypt_key: it
# builds the encrypt schedule via an intra-module call (saving %o7
# around it), and then reverses the schedule in place 32 bytes at a
# time (.Lkey_flip swaps the two ends working inwards) so decryption
# can walk it front-to-back.
$code.=<<___;
	std		%f0, [$out + `16*$i+0`]
	std		%f2, [$out + `16*$i+8`]

	mov		10, $tmp
	st		$tmp, [$out + 240]
	retl
	xor		%o0, %o0, %o0
.type	aes_t4_set_encrypt_key,#function
.size	aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key

.globl	aes_t4_set_decrypt_key
.align	32
aes_t4_set_decrypt_key:
	mov		%o7, %o5
	call		.Lset_encrypt_key
	nop

	mov		%o5, %o7
	sll		$tmp, 4, $inp		! $tmp is number of rounds
	add		$tmp, 2, $tmp
	add		$out, $inp, $inp	! $inp=$out+16*rounds
	srl		$tmp, 2, $tmp		! $tmp=(rounds+2)/4

.Lkey_flip:
	ldd		[$out + 0], %f0
	ldd		[$out + 8], %f2
	ldd		[$out + 16], %f4
	ldd		[$out + 24], %f6
	ldd		[$inp + 0], %f8
	ldd		[$inp + 8], %f10
	ldd		[$inp - 16], %f12
	ldd		[$inp - 8], %f14
	sub		$tmp, 1, $tmp
	std		%f0, [$inp + 0]
	std		%f2, [$inp + 8]
	std		%f4, [$inp - 16]
	std		%f6, [$inp - 8]
	std		%f8, [$out + 0]
	std		%f10, [$out + 8]
	std		%f12, [$out + 16]
	std		%f14, [$out + 24]
	add		$out, 32, $out
	brnz		$tmp, .Lkey_flip
	sub		$inp, 32, $inp

	retl
	xor		%o0, %o0, %o0
.type	aes_t4_set_decrypt_key,#function
.size	aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
___
}
|
415 |
|
{{{
# Subroutines used by the CBC/CTR32/XTS bodies generated by the
# &alg_*_implement() calls below (from sparcv9_modes.pl):
#   _aesNNN_{en,de}crypt_{1x,2x} — run the full round sequence on one
#     or two blocks, with the key schedule pre-loaded into %f16...;
#   _aesNNN_loadkey — load the schedule from memory into %f-registers.
# The %i/%l names are referenced by that generated mode code.
my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));

$code.=<<___;
.align	32
_aes128_encrypt_1x:
___
# 4 double-rounds from %f16..%f46, then the final rounds inline below.
for ($i=0; $i<4; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_eround01	%f48, %f0, %f2, %f4
	aes_eround23	%f50, %f0, %f2, %f2
	aes_eround01_l	%f52, %f4, %f2, %f0
	retl
	aes_eround23_l	%f54, %f4, %f2, %f2
.type	_aes128_encrypt_1x,#function
.size	_aes128_encrypt_1x,.-_aes128_encrypt_1x

.align	32
_aes128_encrypt_2x:
___
# 2x-interleaved variant: block 0 in %f0/%f2, block 1 in %f4/%f6.
for ($i=0; $i<4; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_eround01	%f48, %f0, %f2, %f8
	aes_eround23	%f50, %f0, %f2, %f2
	aes_eround01	%f48, %f4, %f6, %f10
	aes_eround23	%f50, %f4, %f6, %f6
	aes_eround01_l	%f52, %f8, %f2, %f0
	aes_eround23_l	%f54, %f8, %f2, %f2
	aes_eround01_l	%f52, %f10, %f6, %f4
	retl
	aes_eround23_l	%f54, %f10, %f6, %f6
.type	_aes128_encrypt_2x,#function
.size	_aes128_encrypt_2x,.-_aes128_encrypt_2x

.align	32
_aes128_loadkey:
	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5
___
for ($i=2; $i<22;$i++) {			# load key schedule
    $code.=<<___;
	ldd		[$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
	retl
	nop
.type	_aes128_loadkey,#function
.size	_aes128_loadkey,.-_aes128_loadkey
_aes128_load_enckey=_aes128_loadkey
_aes128_load_deckey=_aes128_loadkey

___

# Emit the aes128_t4_* mode entry points (CTR32/XTS only for EVP).
&alg_cbc_encrypt_implement("aes",128);
if ($::evp) {
    &alg_ctr32_implement("aes",128);
    &alg_xts_implement("aes",128,"en");
    &alg_xts_implement("aes",128,"de");
}
&alg_cbc_decrypt_implement("aes",128);

$code.=<<___;
.align	32
_aes128_decrypt_1x:
___
for ($i=0; $i<4; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_dround01	%f48, %f0, %f2, %f4
	aes_dround23	%f50, %f0, %f2, %f2
	aes_dround01_l	%f52, %f4, %f2, %f0
	retl
	aes_dround23_l	%f54, %f4, %f2, %f2
.type	_aes128_decrypt_1x,#function
.size	_aes128_decrypt_1x,.-_aes128_decrypt_1x

.align	32
_aes128_decrypt_2x:
___
for ($i=0; $i<4; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_dround01	%f48, %f0, %f2, %f8
	aes_dround23	%f50, %f0, %f2, %f2
	aes_dround01	%f48, %f4, %f6, %f10
	aes_dround23	%f50, %f4, %f6, %f6
	aes_dround01_l	%f52, %f8, %f2, %f0
	aes_dround23_l	%f54, %f8, %f2, %f2
	aes_dround01_l	%f52, %f10, %f6, %f4
	retl
	aes_dround23_l	%f54, %f10, %f6, %f6
.type	_aes128_decrypt_2x,#function
.size	_aes128_decrypt_2x,.-_aes128_decrypt_2x
___

# 192-bit variants: 5 double-rounds, schedule in %f16..%f62.
$code.=<<___;
.align	32
_aes192_encrypt_1x:
___
for ($i=0; $i<5; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_eround01	%f56, %f0, %f2, %f4
	aes_eround23	%f58, %f0, %f2, %f2
	aes_eround01_l	%f60, %f4, %f2, %f0
	retl
	aes_eround23_l	%f62, %f4, %f2, %f2
.type	_aes192_encrypt_1x,#function
.size	_aes192_encrypt_1x,.-_aes192_encrypt_1x

.align	32
_aes192_encrypt_2x:
___
for ($i=0; $i<5; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
# 256-bit variants follow: the full schedule does not fit in the
# register file, so %f16..%f22 are reloaded from [$key + 208..232] for
# the last rounds and restored from [$key + 16..40] on the way out.
$code.=<<___;
	aes_eround01	%f56, %f0, %f2, %f8
	aes_eround23	%f58, %f0, %f2, %f2
	aes_eround01	%f56, %f4, %f6, %f10
	aes_eround23	%f58, %f4, %f6, %f6
	aes_eround01_l	%f60, %f8, %f2, %f0
	aes_eround23_l	%f62, %f8, %f2, %f2
	aes_eround01_l	%f60, %f10, %f6, %f4
	retl
	aes_eround23_l	%f62, %f10, %f6, %f6
.type	_aes192_encrypt_2x,#function
.size	_aes192_encrypt_2x,.-_aes192_encrypt_2x

.align	32
_aes256_encrypt_1x:
	aes_eround01	%f16, %f0, %f2, %f4
	aes_eround23	%f18, %f0, %f2, %f2
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_eround01	%f20, %f4, %f2, %f0
	aes_eround23	%f22, %f4, %f2, %f2
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_eround01	%f16, %f0, %f2, %f4
	aes_eround23	%f18, %f0, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_eround01_l	%f20, %f4, %f2, %f0
	aes_eround23_l	%f22, %f4, %f2, %f2
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_encrypt_1x,#function
.size	_aes256_encrypt_1x,.-_aes256_encrypt_1x

.align	32
_aes256_encrypt_2x:
	aes_eround01	%f16, %f0, %f2, %f8
	aes_eround23	%f18, %f0, %f2, %f2
	aes_eround01	%f16, %f4, %f6, %f10
	aes_eround23	%f18, %f4, %f6, %f6
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_eround01	%f20, %f8, %f2, %f0
	aes_eround23	%f22, %f8, %f2, %f2
	aes_eround01	%f20, %f10, %f6, %f4
	aes_eround23	%f22, %f10, %f6, %f6
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_eround01	%f16, %f0, %f2, %f8
	aes_eround23	%f18, %f0, %f2, %f2
	aes_eround01	%f16, %f4, %f6, %f10
	aes_eround23	%f18, %f4, %f6, %f6
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_eround01_l	%f20, %f8, %f2, %f0
	aes_eround23_l	%f22, %f8, %f2, %f2
	aes_eround01_l	%f20, %f10, %f6, %f4
	aes_eround23_l	%f22, %f10, %f6, %f6
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_encrypt_2x,#function
.size	_aes256_encrypt_2x,.-_aes256_encrypt_2x

.align	32
_aes192_loadkey:
	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5
___
for ($i=2; $i<26;$i++) {			# load key schedule
    $code.=<<___;
	ldd		[$key + `8*$i`], %f`12+2*$i`
___
}
# The same loader serves 192- and 256-bit schedules (aliases below).
$code.=<<___;
	retl
	nop
.type	_aes192_loadkey,#function
.size	_aes192_loadkey,.-_aes192_loadkey
_aes256_loadkey=_aes192_loadkey
_aes192_load_enckey=_aes192_loadkey
_aes192_load_deckey=_aes192_loadkey
_aes256_load_enckey=_aes192_loadkey
_aes256_load_deckey=_aes192_loadkey
___

# Emit the aes192_t4_*/aes256_t4_* mode entry points.
&alg_cbc_encrypt_implement("aes",256);
&alg_cbc_encrypt_implement("aes",192);
if ($::evp) {
    &alg_ctr32_implement("aes",256);
    &alg_xts_implement("aes",256,"en");
    &alg_xts_implement("aes",256,"de");
    &alg_ctr32_implement("aes",192);
}
&alg_cbc_decrypt_implement("aes",192);
&alg_cbc_decrypt_implement("aes",256);

$code.=<<___;
.align	32
_aes256_decrypt_1x:
	aes_dround01	%f16, %f0, %f2, %f4
	aes_dround23	%f18, %f0, %f2, %f2
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_dround01	%f20, %f4, %f2, %f0
	aes_dround23	%f22, %f4, %f2, %f2
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_dround01	%f16, %f0, %f2, %f4
	aes_dround23	%f18, %f0, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_dround01_l	%f20, %f4, %f2, %f0
	aes_dround23_l	%f22, %f4, %f2, %f2
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_decrypt_1x,#function
.size	_aes256_decrypt_1x,.-_aes256_decrypt_1x

.align	32
_aes256_decrypt_2x:
	aes_dround01	%f16, %f0, %f2, %f8
	aes_dround23	%f18, %f0, %f2, %f2
	aes_dround01	%f16, %f4, %f6, %f10
	aes_dround23	%f18, %f4, %f6, %f6
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_dround01	%f20, %f8, %f2, %f0
	aes_dround23	%f22, %f8, %f2, %f2
	aes_dround01	%f20, %f10, %f6, %f4
	aes_dround23	%f22, %f10, %f6, %f6
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_dround01	%f16, %f0, %f2, %f8
	aes_dround23	%f18, %f0, %f2, %f2
	aes_dround01	%f16, %f4, %f6, %f10
	aes_dround23	%f18, %f4, %f6, %f6
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_dround01_l	%f20, %f8, %f2, %f0
	aes_dround23_l	%f22, %f8, %f2, %f2
	aes_dround01_l	%f20, %f10, %f6, %f4
	aes_dround23_l	%f22, %f10, %f6, %f6
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_decrypt_2x,#function
.size	_aes256_decrypt_2x,.-_aes256_decrypt_2x

.align	32
_aes192_decrypt_1x:
___
for ($i=0; $i<5; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_dround01	%f56, %f0, %f2, %f4
	aes_dround23	%f58, %f0, %f2, %f2
	aes_dround01_l	%f60, %f4, %f2, %f0
	retl
	aes_dround23_l	%f62, %f4, %f2, %f2
.type	_aes192_decrypt_1x,#function
.size	_aes192_decrypt_1x,.-_aes192_decrypt_1x

.align	32
_aes192_decrypt_2x:
___
for ($i=0; $i<5; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_dround01	%f56, %f0, %f2, %f8
	aes_dround23	%f58, %f0, %f2, %f2
	aes_dround01	%f56, %f4, %f6, %f10
	aes_dround23	%f58, %f4, %f6, %f6
	aes_dround01_l	%f60, %f8, %f2, %f0
	aes_dround23_l	%f62, %f8, %f2, %f2
	aes_dround01_l	%f60, %f10, %f6, %f4
	retl
	aes_dround23_l	%f62, %f10, %f6, %f6
.type	_aes192_decrypt_2x,#function
.size	_aes192_decrypt_2x,.-_aes192_decrypt_2x
___
}}}
|
831 |
|
# Non-EVP compatibility entry points (see the note next to $::evp
# above): AES_encrypt/AES_decrypt simply alias the T4 routines, while
# the set_key wrappers perform the argument checks openssl/aes.h
# callers expect (-1 for NULL/misaligned pointers, -2 for an
# unsupported bit length) before tail-branching to the real code.
if (!$::evp) {
$code.=<<___;
.global	AES_encrypt
AES_encrypt=aes_t4_encrypt
.global	AES_decrypt
AES_decrypt=aes_t4_decrypt
.global	AES_set_encrypt_key
.align	32
AES_set_encrypt_key:
	andcc		%o2, 7, %g0		! check alignment
	bnz,a,pn	%icc, 1f
	mov		-1, %o0
	brz,a,pn	%o0, 1f
	mov		-1, %o0
	brz,a,pn	%o2, 1f
	mov		-1, %o0
	andncc		%o1, 0x1c0, %g0
	bnz,a,pn	%icc, 1f
	mov		-2, %o0
	cmp		%o1, 128
	bl,a,pn		%icc, 1f
	mov		-2, %o0
	b		aes_t4_set_encrypt_key
	nop
1:	retl
	nop
.type	AES_set_encrypt_key,#function
.size	AES_set_encrypt_key,.-AES_set_encrypt_key

.global	AES_set_decrypt_key
.align	32
AES_set_decrypt_key:
	andcc		%o2, 7, %g0		! check alignment
	bnz,a,pn	%icc, 1f
	mov		-1, %o0
	brz,a,pn	%o0, 1f
	mov		-1, %o0
	brz,a,pn	%o2, 1f
	mov		-1, %o0
	andncc		%o1, 0x1c0, %g0
	bnz,a,pn	%icc, 1f
	mov		-2, %o0
	cmp		%o1, 128
	bl,a,pn		%icc, 1f
	mov		-2, %o0
	b		aes_t4_set_decrypt_key
	nop
1:	retl
	nop
.type	AES_set_decrypt_key,#function
.size	AES_set_decrypt_key,.-AES_set_decrypt_key
___

# AES_cbc_encrypt(in, out, len, key, ivec, enc) — plain %o registers.
my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));

# Dispatch on the round count stored at key[240] (12 => 192-bit,
# less => 128-bit, more => 256-bit) and on the encrypt/decrypt flag.
$code.=<<___;
.globl	AES_cbc_encrypt
.align	32
AES_cbc_encrypt:
	ld		[$key + 240], %g1
	nop
	brz		$enc, .Lcbc_decrypt
	cmp		%g1, 12

	bl,pt		%icc, aes128_t4_cbc_encrypt
	nop
	be,pn		%icc, aes192_t4_cbc_encrypt
	nop
	ba		aes256_t4_cbc_encrypt
	nop

.Lcbc_decrypt:
	bl,pt		%icc, aes128_t4_cbc_decrypt
	nop
	be,pn		%icc, aes192_t4_cbc_decrypt
	nop
	ba		aes256_t4_cbc_decrypt
	nop
.type	AES_cbc_encrypt,#function
.size	AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
|
# Append the citation string, then hand the accumulated $code to the
# perlasm back-end, which writes the finished assembly to STDOUT.
$code.=<<___;
.asciz	"AES for SPARC T4, David S. Miller, Andy Polyakov"
.align	4
___

&emit_assembler();

# Buffered-write errors (full disk, broken pipe) only surface at close;
# fail loudly rather than let the build consume a truncated .S file.
close STDOUT or die "error closing STDOUT: $!";