1 #!/usr/bin/env perl |
|
2 |
|
3 # ==================================================================== |
|
4 # Written by David S. Miller <[email protected]> and Andy Polyakov |
|
5 # <[email protected]>. The module is licensed under 2-clause BSD |
|
6 # license. October 2012. All rights reserved. |
|
7 # ==================================================================== |
|
8 |
|
9 ###################################################################### |
|
10 # AES for SPARC T4. |
|
11 # |
|
12 # AES round instructions complete in 3 cycles and can be issued every |
|
13 # cycle. It means that round calculations should take 4*rounds cycles, |
|
14 # because any given round instruction depends on result of *both* |
|
15 # previous instructions: |
|
16 # |
|
17 # |0 |1 |2 |3 |4 |
|
18 # |01|01|01| |
|
19 # |23|23|23| |
|
20 # |01|01|... |
|
21 # |23|... |
|
22 # |
|
23 # Provided that fxor [with IV] takes 3 cycles to complete, critical |
|
24 # path length for CBC encrypt would be 3+4*rounds, or in other words |
|
25 # it should process one byte in at least (3+4*rounds)/16 cycles. This |
|
26 # estimate doesn't account for "collateral" instructions, such as |
|
27 # fetching input from memory, xor-ing it with zero-round key and |
|
28 # storing the result. Yet, *measured* performance [for data aligned |
|
29 # at 64-bit boundary!] deviates from this equation by less than 0.5%: |
|
30 # |
|
31 # 128-bit key 192- 256- |
|
32 # CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90 |
|
33 # (*) numbers after slash are for |
|
34 # misaligned data. |
|
35 # |
|
36 # Out-of-order execution logic managed to fully overlap "collateral" |
|
37 # instructions with those on critical path. Amazing! |
|
38 # |
|
39 # As with Intel AES-NI, question is if it's possible to improve |
|
# performance of parallelizable modes by interleaving round
|
41 # instructions. Provided round instruction latency and throughput |
|
42 # optimal interleave factor is 2. But can we expect 2x performance |
|
43 # improvement? Well, as round instructions can be issued one per |
|
44 # cycle, they don't saturate the 2-way issue pipeline and therefore |
|
45 # there is room for "collateral" calculations... Yet, 2x speed-up |
|
# over CBC encrypt remains unattainable:
|
47 # |
|
48 # 128-bit key 192- 256- |
|
49 # CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61 |
|
50 # CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61 |
|
51 # (*) numbers after slash are for |
|
52 # misaligned data. |
|
53 # |
|
54 # Estimates based on amount of instructions under assumption that |
|
55 # round instructions are not pairable with any other instruction |
|
56 # suggest that latter is the actual case and pipeline runs |
|
57 # underutilized. It should be noted that T4 out-of-order execution |
|
58 # logic is so capable that performance gain from 2x interleave is |
|
59 # not even impressive, ~7-13% over non-interleaved code, largest |
|
60 # for 256-bit keys. |
|
61 |
|
62 # To anchor to something else, software implementation processes |
|
63 # one byte in 29 cycles with 128-bit key on same processor. Intel |
|
64 # Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts |
|
65 # in 0.93, naturally with AES-NI. |
|
66 |
|
67 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
|
68 push(@INC,"${dir}","${dir}../../perlasm"); |
|
69 require "sparcv9_modes.pl"; |
|
70 |
|
71 &asm_init(@ARGV); |
|
72 |
|
73 $::evp=1; # if $evp is set to 0, script generates module with |
|
74 # AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry |
|
75 # points. These however are not fully compatible with openssl/aes.h, |
|
76 # because they expect AES_KEY to be aligned at 64-bit boundary. When |
|
77 # used through EVP, alignment is arranged at EVP layer. Second thing |
|
78 # that is arranged by EVP is at least 32-bit alignment of IV. |
|
79 |
|
80 ###################################################################### |
|
81 # single-round subroutines |
|
82 # |
|
83 { |
|
84 my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5)); |
|
85 |
|
86 $code=<<___; |
|
87 .text |
|
88 |
|
89 .globl aes_t4_encrypt |
|
90 .align 32 |
|
91 aes_t4_encrypt: |
|
92 andcc $inp, 7, %g1 ! is input aligned? |
|
93 andn $inp, 7, $inp |
|
94 |
|
95 ldx [$key + 0], %g4 |
|
96 ldx [$key + 8], %g5 |
|
97 |
|
98 ldx [$inp + 0], %o4 |
|
99 bz,pt %icc, 1f |
|
100 ldx [$inp + 8], %o5 |
|
101 ldx [$inp + 16], $inp |
|
102 sll %g1, 3, %g1 |
|
103 sub %g0, %g1, %o3 |
|
104 sllx %o4, %g1, %o4 |
|
105 sllx %o5, %g1, %g1 |
|
106 srlx %o5, %o3, %o5 |
|
107 srlx $inp, %o3, %o3 |
|
108 or %o5, %o4, %o4 |
|
109 or %o3, %g1, %o5 |
|
110 1: |
|
111 ld [$key + 240], $rounds |
|
112 ldd [$key + 16], %f12 |
|
113 ldd [$key + 24], %f14 |
|
114 xor %g4, %o4, %o4 |
|
115 xor %g5, %o5, %o5 |
|
116 movxtod %o4, %f0 |
|
117 movxtod %o5, %f2 |
|
118 srl $rounds, 1, $rounds |
|
119 ldd [$key + 32], %f16 |
|
120 sub $rounds, 1, $rounds |
|
121 ldd [$key + 40], %f18 |
|
122 add $key, 48, $key |
|
123 |
|
124 .Lenc: |
|
125 aes_eround01 %f12, %f0, %f2, %f4 |
|
126 aes_eround23 %f14, %f0, %f2, %f2 |
|
127 ldd [$key + 0], %f12 |
|
128 ldd [$key + 8], %f14 |
|
129 sub $rounds,1,$rounds |
|
130 aes_eround01 %f16, %f4, %f2, %f0 |
|
131 aes_eround23 %f18, %f4, %f2, %f2 |
|
132 ldd [$key + 16], %f16 |
|
133 ldd [$key + 24], %f18 |
|
134 brnz,pt $rounds, .Lenc |
|
135 add $key, 32, $key |
|
136 |
|
137 andcc $out, 7, $tmp ! is output aligned? |
|
138 aes_eround01 %f12, %f0, %f2, %f4 |
|
139 aes_eround23 %f14, %f0, %f2, %f2 |
|
140 aes_eround01_l %f16, %f4, %f2, %f0 |
|
141 aes_eround23_l %f18, %f4, %f2, %f2 |
|
142 |
|
143 bnz,pn %icc, 2f |
|
144 nop |
|
145 |
|
146 std %f0, [$out + 0] |
|
147 retl |
|
148 std %f2, [$out + 8] |
|
149 |
|
150 2: alignaddrl $out, %g0, $out |
|
151 mov 0xff, $mask |
|
152 srl $mask, $tmp, $mask |
|
153 |
|
154 faligndata %f0, %f0, %f4 |
|
155 faligndata %f0, %f2, %f6 |
|
156 faligndata %f2, %f2, %f8 |
|
157 |
|
158 stda %f4, [$out + $mask]0xc0 ! partial store |
|
159 std %f6, [$out + 8] |
|
160 add $out, 16, $out |
|
161 orn %g0, $mask, $mask |
|
162 retl |
|
163 stda %f8, [$out + $mask]0xc0 ! partial store |
|
164 .type aes_t4_encrypt,#function |
|
165 .size aes_t4_encrypt,.-aes_t4_encrypt |
|
166 |
|
167 .globl aes_t4_decrypt |
|
168 .align 32 |
|
169 aes_t4_decrypt: |
|
170 andcc $inp, 7, %g1 ! is input aligned? |
|
171 andn $inp, 7, $inp |
|
172 |
|
173 ldx [$key + 0], %g4 |
|
174 ldx [$key + 8], %g5 |
|
175 |
|
176 ldx [$inp + 0], %o4 |
|
177 bz,pt %icc, 1f |
|
178 ldx [$inp + 8], %o5 |
|
179 ldx [$inp + 16], $inp |
|
180 sll %g1, 3, %g1 |
|
181 sub %g0, %g1, %o3 |
|
182 sllx %o4, %g1, %o4 |
|
183 sllx %o5, %g1, %g1 |
|
184 srlx %o5, %o3, %o5 |
|
185 srlx $inp, %o3, %o3 |
|
186 or %o5, %o4, %o4 |
|
187 or %o3, %g1, %o5 |
|
188 1: |
|
189 ld [$key + 240], $rounds |
|
190 ldd [$key + 16], %f12 |
|
191 ldd [$key + 24], %f14 |
|
192 xor %g4, %o4, %o4 |
|
193 xor %g5, %o5, %o5 |
|
194 movxtod %o4, %f0 |
|
195 movxtod %o5, %f2 |
|
196 srl $rounds, 1, $rounds |
|
197 ldd [$key + 32], %f16 |
|
198 sub $rounds, 1, $rounds |
|
199 ldd [$key + 40], %f18 |
|
200 add $key, 48, $key |
|
201 |
|
202 .Ldec: |
|
203 aes_dround01 %f12, %f0, %f2, %f4 |
|
204 aes_dround23 %f14, %f0, %f2, %f2 |
|
205 ldd [$key + 0], %f12 |
|
206 ldd [$key + 8], %f14 |
|
207 sub $rounds,1,$rounds |
|
208 aes_dround01 %f16, %f4, %f2, %f0 |
|
209 aes_dround23 %f18, %f4, %f2, %f2 |
|
210 ldd [$key + 16], %f16 |
|
211 ldd [$key + 24], %f18 |
|
212 brnz,pt $rounds, .Ldec |
|
213 add $key, 32, $key |
|
214 |
|
215 andcc $out, 7, $tmp ! is output aligned? |
|
216 aes_dround01 %f12, %f0, %f2, %f4 |
|
217 aes_dround23 %f14, %f0, %f2, %f2 |
|
218 aes_dround01_l %f16, %f4, %f2, %f0 |
|
219 aes_dround23_l %f18, %f4, %f2, %f2 |
|
220 |
|
221 bnz,pn %icc, 2f |
|
222 nop |
|
223 |
|
224 std %f0, [$out + 0] |
|
225 retl |
|
226 std %f2, [$out + 8] |
|
227 |
|
228 2: alignaddrl $out, %g0, $out |
|
229 mov 0xff, $mask |
|
230 srl $mask, $tmp, $mask |
|
231 |
|
232 faligndata %f0, %f0, %f4 |
|
233 faligndata %f0, %f2, %f6 |
|
234 faligndata %f2, %f2, %f8 |
|
235 |
|
236 stda %f4, [$out + $mask]0xc0 ! partial store |
|
237 std %f6, [$out + 8] |
|
238 add $out, 16, $out |
|
239 orn %g0, $mask, $mask |
|
240 retl |
|
241 stda %f8, [$out + $mask]0xc0 ! partial store |
|
242 .type aes_t4_decrypt,#function |
|
243 .size aes_t4_decrypt,.-aes_t4_decrypt |
|
244 ___ |
|
245 } |
|
246 |
|
247 ###################################################################### |
|
248 # key setup subroutines |
|
249 # |
|
250 { |
|
251 my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5)); |
|
252 $code.=<<___; |
|
253 .globl aes_t4_set_encrypt_key |
|
254 .align 32 |
|
255 aes_t4_set_encrypt_key: |
|
256 .Lset_encrypt_key: |
|
257 and $inp, 7, $tmp |
|
258 alignaddr $inp, %g0, $inp |
|
259 cmp $bits, 192 |
|
260 ldd [$inp + 0], %f0 |
|
261 bl,pt %icc,.L128 |
|
262 ldd [$inp + 8], %f2 |
|
263 |
|
264 be,pt %icc,.L192 |
|
265 ldd [$inp + 16], %f4 |
|
266 brz,pt $tmp, .L256aligned |
|
267 ldd [$inp + 24], %f6 |
|
268 |
|
269 ldd [$inp + 32], %f8 |
|
270 faligndata %f0, %f2, %f0 |
|
271 faligndata %f2, %f4, %f2 |
|
272 faligndata %f4, %f6, %f4 |
|
273 faligndata %f6, %f8, %f6 |
|
274 .L256aligned: |
|
275 ___ |
|
276 for ($i=0; $i<6; $i++) { |
|
277 $code.=<<___; |
|
278 std %f0, [$out + `32*$i+0`] |
|
279 aes_kexpand1 %f0, %f6, $i, %f0 |
|
280 std %f2, [$out + `32*$i+8`] |
|
281 aes_kexpand2 %f2, %f0, %f2 |
|
282 std %f4, [$out + `32*$i+16`] |
|
283 aes_kexpand0 %f4, %f2, %f4 |
|
284 std %f6, [$out + `32*$i+24`] |
|
285 aes_kexpand2 %f6, %f4, %f6 |
|
286 ___ |
|
287 } |
|
288 $code.=<<___; |
|
289 std %f0, [$out + `32*$i+0`] |
|
290 aes_kexpand1 %f0, %f6, $i, %f0 |
|
291 std %f2, [$out + `32*$i+8`] |
|
292 aes_kexpand2 %f2, %f0, %f2 |
|
293 std %f4, [$out + `32*$i+16`] |
|
294 std %f6, [$out + `32*$i+24`] |
|
295 std %f0, [$out + `32*$i+32`] |
|
296 std %f2, [$out + `32*$i+40`] |
|
297 |
|
298 mov 14, $tmp |
|
299 st $tmp, [$out + 240] |
|
300 retl |
|
301 xor %o0, %o0, %o0 |
|
302 |
|
303 .align 16 |
|
304 .L192: |
|
305 brz,pt $tmp, .L192aligned |
|
306 nop |
|
307 |
|
308 ldd [$inp + 24], %f6 |
|
309 faligndata %f0, %f2, %f0 |
|
310 faligndata %f2, %f4, %f2 |
|
311 faligndata %f4, %f6, %f4 |
|
312 .L192aligned: |
|
313 ___ |
|
314 for ($i=0; $i<7; $i++) { |
|
315 $code.=<<___; |
|
316 std %f0, [$out + `24*$i+0`] |
|
317 aes_kexpand1 %f0, %f4, $i, %f0 |
|
318 std %f2, [$out + `24*$i+8`] |
|
319 aes_kexpand2 %f2, %f0, %f2 |
|
320 std %f4, [$out + `24*$i+16`] |
|
321 aes_kexpand2 %f4, %f2, %f4 |
|
322 ___ |
|
323 } |
|
324 $code.=<<___; |
|
325 std %f0, [$out + `24*$i+0`] |
|
326 aes_kexpand1 %f0, %f4, $i, %f0 |
|
327 std %f2, [$out + `24*$i+8`] |
|
328 aes_kexpand2 %f2, %f0, %f2 |
|
329 std %f4, [$out + `24*$i+16`] |
|
330 std %f0, [$out + `24*$i+24`] |
|
331 std %f2, [$out + `24*$i+32`] |
|
332 |
|
333 mov 12, $tmp |
|
334 st $tmp, [$out + 240] |
|
335 retl |
|
336 xor %o0, %o0, %o0 |
|
337 |
|
338 .align 16 |
|
339 .L128: |
|
340 brz,pt $tmp, .L128aligned |
|
341 nop |
|
342 |
|
343 ldd [$inp + 16], %f4 |
|
344 faligndata %f0, %f2, %f0 |
|
345 faligndata %f2, %f4, %f2 |
|
346 .L128aligned: |
|
347 ___ |
|
348 for ($i=0; $i<10; $i++) { |
|
349 $code.=<<___; |
|
350 std %f0, [$out + `16*$i+0`] |
|
351 aes_kexpand1 %f0, %f2, $i, %f0 |
|
352 std %f2, [$out + `16*$i+8`] |
|
353 aes_kexpand2 %f2, %f0, %f2 |
|
354 ___ |
|
355 } |
|
356 $code.=<<___; |
|
357 std %f0, [$out + `16*$i+0`] |
|
358 std %f2, [$out + `16*$i+8`] |
|
359 |
|
360 mov 10, $tmp |
|
361 st $tmp, [$out + 240] |
|
362 retl |
|
363 xor %o0, %o0, %o0 |
|
364 .type aes_t4_set_encrypt_key,#function |
|
365 .size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key |
|
366 |
|
367 .globl aes_t4_set_decrypt_key |
|
368 .align 32 |
|
369 aes_t4_set_decrypt_key: |
|
370 mov %o7, %o5 |
|
371 call .Lset_encrypt_key |
|
372 nop |
|
373 |
|
374 mov %o5, %o7 |
|
375 sll $tmp, 4, $inp ! $tmp is number of rounds |
|
376 add $tmp, 2, $tmp |
|
377 add $out, $inp, $inp ! $inp=$out+16*rounds |
|
378 srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4 |
|
379 |
|
380 .Lkey_flip: |
|
381 ldd [$out + 0], %f0 |
|
382 ldd [$out + 8], %f2 |
|
383 ldd [$out + 16], %f4 |
|
384 ldd [$out + 24], %f6 |
|
385 ldd [$inp + 0], %f8 |
|
386 ldd [$inp + 8], %f10 |
|
387 ldd [$inp - 16], %f12 |
|
388 ldd [$inp - 8], %f14 |
|
389 sub $tmp, 1, $tmp |
|
390 std %f0, [$inp + 0] |
|
391 std %f2, [$inp + 8] |
|
392 std %f4, [$inp - 16] |
|
393 std %f6, [$inp - 8] |
|
394 std %f8, [$out + 0] |
|
395 std %f10, [$out + 8] |
|
396 std %f12, [$out + 16] |
|
397 std %f14, [$out + 24] |
|
398 add $out, 32, $out |
|
399 brnz $tmp, .Lkey_flip |
|
400 sub $inp, 32, $inp |
|
401 |
|
402 retl |
|
403 xor %o0, %o0, %o0 |
|
404 .type aes_t4_set_decrypt_key,#function |
|
405 .size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key |
|
406 ___ |
|
407 } |
|
408 |
|
409 {{{ |
|
410 my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5)); |
|
411 my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7)); |
|
412 |
|
413 $code.=<<___; |
|
414 .align 32 |
|
415 _aes128_loadkey: |
|
416 ldx [$key + 0], %g4 |
|
417 ldx [$key + 8], %g5 |
|
418 ___ |
|
419 for ($i=2; $i<22;$i++) { # load key schedule |
|
420 $code.=<<___; |
|
421 ldd [$key + `8*$i`], %f`12+2*$i` |
|
422 ___ |
|
423 } |
|
424 $code.=<<___; |
|
425 retl |
|
426 nop |
|
427 .type _aes128_loadkey,#function |
|
428 .size _aes128_loadkey,.-_aes128_loadkey |
|
429 _aes128_load_enckey=_aes128_loadkey |
|
430 _aes128_load_deckey=_aes128_loadkey |
|
431 |
|
432 .align 32 |
|
433 _aes128_encrypt_1x: |
|
434 ___ |
|
435 for ($i=0; $i<4; $i++) { |
|
436 $code.=<<___; |
|
437 aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4 |
|
438 aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 |
|
439 aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0 |
|
440 aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2 |
|
441 ___ |
|
442 } |
|
443 $code.=<<___; |
|
444 aes_eround01 %f48, %f0, %f2, %f4 |
|
445 aes_eround23 %f50, %f0, %f2, %f2 |
|
446 aes_eround01_l %f52, %f4, %f2, %f0 |
|
447 retl |
|
448 aes_eround23_l %f54, %f4, %f2, %f2 |
|
449 .type _aes128_encrypt_1x,#function |
|
450 .size _aes128_encrypt_1x,.-_aes128_encrypt_1x |
|
451 |
|
452 .align 32 |
|
453 _aes128_encrypt_2x: |
|
454 ___ |
|
455 for ($i=0; $i<4; $i++) { |
|
456 $code.=<<___; |
|
457 aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8 |
|
458 aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 |
|
459 aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10 |
|
460 aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6 |
|
461 aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0 |
|
462 aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2 |
|
463 aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4 |
|
464 aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6 |
|
465 ___ |
|
466 } |
|
467 $code.=<<___; |
|
468 aes_eround01 %f48, %f0, %f2, %f8 |
|
469 aes_eround23 %f50, %f0, %f2, %f2 |
|
470 aes_eround01 %f48, %f4, %f6, %f10 |
|
471 aes_eround23 %f50, %f4, %f6, %f6 |
|
472 aes_eround01_l %f52, %f8, %f2, %f0 |
|
473 aes_eround23_l %f54, %f8, %f2, %f2 |
|
474 aes_eround01_l %f52, %f10, %f6, %f4 |
|
475 retl |
|
476 aes_eround23_l %f54, %f10, %f6, %f6 |
|
477 .type _aes128_encrypt_2x,#function |
|
478 .size _aes128_encrypt_2x,.-_aes128_encrypt_2x |
|
479 |
|
480 .align 32 |
|
481 _aes128_decrypt_1x: |
|
482 ___ |
|
483 for ($i=0; $i<4; $i++) { |
|
484 $code.=<<___; |
|
485 aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4 |
|
486 aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 |
|
487 aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0 |
|
488 aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2 |
|
489 ___ |
|
490 } |
|
491 $code.=<<___; |
|
492 aes_dround01 %f48, %f0, %f2, %f4 |
|
493 aes_dround23 %f50, %f0, %f2, %f2 |
|
494 aes_dround01_l %f52, %f4, %f2, %f0 |
|
495 retl |
|
496 aes_dround23_l %f54, %f4, %f2, %f2 |
|
497 .type _aes128_decrypt_1x,#function |
|
498 .size _aes128_decrypt_1x,.-_aes128_decrypt_1x |
|
499 |
|
500 .align 32 |
|
501 _aes128_decrypt_2x: |
|
502 ___ |
|
503 for ($i=0; $i<4; $i++) { |
|
504 $code.=<<___; |
|
505 aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8 |
|
506 aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 |
|
507 aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10 |
|
508 aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6 |
|
509 aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0 |
|
510 aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2 |
|
511 aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4 |
|
512 aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6 |
|
513 ___ |
|
514 } |
|
515 $code.=<<___; |
|
516 aes_dround01 %f48, %f0, %f2, %f8 |
|
517 aes_dround23 %f50, %f0, %f2, %f2 |
|
518 aes_dround01 %f48, %f4, %f6, %f10 |
|
519 aes_dround23 %f50, %f4, %f6, %f6 |
|
520 aes_dround01_l %f52, %f8, %f2, %f0 |
|
521 aes_dround23_l %f54, %f8, %f2, %f2 |
|
522 aes_dround01_l %f52, %f10, %f6, %f4 |
|
523 retl |
|
524 aes_dround23_l %f54, %f10, %f6, %f6 |
|
525 .type _aes128_decrypt_2x,#function |
|
526 .size _aes128_decrypt_2x,.-_aes128_decrypt_2x |
|
527 |
|
528 .align 32 |
|
529 _aes192_loadkey: |
|
530 _aes256_loadkey: |
|
531 ldx [$key + 0], %g4 |
|
532 ldx [$key + 8], %g5 |
|
533 ___ |
|
534 for ($i=2; $i<26;$i++) { # load key schedule |
|
535 $code.=<<___; |
|
536 ldd [$key + `8*$i`], %f`12+2*$i` |
|
537 ___ |
|
538 } |
|
539 $code.=<<___; |
|
540 retl |
|
541 nop |
|
542 .type _aes192_loadkey,#function |
|
543 .size _aes192_loadkey,.-_aes192_loadkey |
|
544 _aes192_load_enckey=_aes192_loadkey |
|
545 _aes192_load_deckey=_aes192_loadkey |
|
546 _aes256_load_enckey=_aes192_loadkey |
|
547 _aes256_load_deckey=_aes192_loadkey |
|
548 |
|
549 .align 32 |
|
550 _aes192_encrypt_1x: |
|
551 ___ |
|
552 for ($i=0; $i<5; $i++) { |
|
553 $code.=<<___; |
|
554 aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4 |
|
555 aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 |
|
556 aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0 |
|
557 aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2 |
|
558 ___ |
|
559 } |
|
560 $code.=<<___; |
|
561 aes_eround01 %f56, %f0, %f2, %f4 |
|
562 aes_eround23 %f58, %f0, %f2, %f2 |
|
563 aes_eround01_l %f60, %f4, %f2, %f0 |
|
564 retl |
|
565 aes_eround23_l %f62, %f4, %f2, %f2 |
|
566 .type _aes192_encrypt_1x,#function |
|
567 .size _aes192_encrypt_1x,.-_aes192_encrypt_1x |
|
568 |
|
569 .align 32 |
|
570 _aes192_encrypt_2x: |
|
571 ___ |
|
572 for ($i=0; $i<5; $i++) { |
|
573 $code.=<<___; |
|
574 aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8 |
|
575 aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 |
|
576 aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10 |
|
577 aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6 |
|
578 aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0 |
|
579 aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2 |
|
580 aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4 |
|
581 aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6 |
|
582 ___ |
|
583 } |
|
584 $code.=<<___; |
|
585 aes_eround01 %f56, %f0, %f2, %f8 |
|
586 aes_eround23 %f58, %f0, %f2, %f2 |
|
587 aes_eround01 %f56, %f4, %f6, %f10 |
|
588 aes_eround23 %f58, %f4, %f6, %f6 |
|
589 aes_eround01_l %f60, %f8, %f2, %f0 |
|
590 aes_eround23_l %f62, %f8, %f2, %f2 |
|
591 aes_eround01_l %f60, %f10, %f6, %f4 |
|
592 retl |
|
593 aes_eround23_l %f62, %f10, %f6, %f6 |
|
594 .type _aes192_encrypt_2x,#function |
|
595 .size _aes192_encrypt_2x,.-_aes192_encrypt_2x |
|
596 |
|
597 .align 32 |
|
598 _aes192_decrypt_1x: |
|
599 ___ |
|
600 for ($i=0; $i<5; $i++) { |
|
601 $code.=<<___; |
|
602 aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4 |
|
603 aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 |
|
604 aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0 |
|
605 aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2 |
|
606 ___ |
|
607 } |
|
608 $code.=<<___; |
|
609 aes_dround01 %f56, %f0, %f2, %f4 |
|
610 aes_dround23 %f58, %f0, %f2, %f2 |
|
611 aes_dround01_l %f60, %f4, %f2, %f0 |
|
612 retl |
|
613 aes_dround23_l %f62, %f4, %f2, %f2 |
|
614 .type _aes192_decrypt_1x,#function |
|
615 .size _aes192_decrypt_1x,.-_aes192_decrypt_1x |
|
616 |
|
617 .align 32 |
|
618 _aes192_decrypt_2x: |
|
619 ___ |
|
620 for ($i=0; $i<5; $i++) { |
|
621 $code.=<<___; |
|
622 aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8 |
|
623 aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 |
|
624 aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10 |
|
625 aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6 |
|
626 aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0 |
|
627 aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2 |
|
628 aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4 |
|
629 aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6 |
|
630 ___ |
|
631 } |
|
632 $code.=<<___; |
|
633 aes_dround01 %f56, %f0, %f2, %f8 |
|
634 aes_dround23 %f58, %f0, %f2, %f2 |
|
635 aes_dround01 %f56, %f4, %f6, %f10 |
|
636 aes_dround23 %f58, %f4, %f6, %f6 |
|
637 aes_dround01_l %f60, %f8, %f2, %f0 |
|
638 aes_dround23_l %f62, %f8, %f2, %f2 |
|
639 aes_dround01_l %f60, %f10, %f6, %f4 |
|
640 retl |
|
641 aes_dround23_l %f62, %f10, %f6, %f6 |
|
642 .type _aes192_decrypt_2x,#function |
|
643 .size _aes192_decrypt_2x,.-_aes192_decrypt_2x |
|
644 |
|
645 .align 32 |
|
646 _aes256_encrypt_1x: |
|
647 aes_eround01 %f16, %f0, %f2, %f4 |
|
648 aes_eround23 %f18, %f0, %f2, %f2 |
|
649 ldd [$key + 208], %f16 |
|
650 ldd [$key + 216], %f18 |
|
651 aes_eround01 %f20, %f4, %f2, %f0 |
|
652 aes_eround23 %f22, %f4, %f2, %f2 |
|
653 ldd [$key + 224], %f20 |
|
654 ldd [$key + 232], %f22 |
|
655 ___ |
|
656 for ($i=1; $i<6; $i++) { |
|
657 $code.=<<___; |
|
658 aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4 |
|
659 aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 |
|
660 aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0 |
|
661 aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2 |
|
662 ___ |
|
663 } |
|
664 $code.=<<___; |
|
665 aes_eround01 %f16, %f0, %f2, %f4 |
|
666 aes_eround23 %f18, %f0, %f2, %f2 |
|
667 ldd [$key + 16], %f16 |
|
668 ldd [$key + 24], %f18 |
|
669 aes_eround01_l %f20, %f4, %f2, %f0 |
|
670 aes_eround23_l %f22, %f4, %f2, %f2 |
|
671 ldd [$key + 32], %f20 |
|
672 retl |
|
673 ldd [$key + 40], %f22 |
|
674 .type _aes256_encrypt_1x,#function |
|
675 .size _aes256_encrypt_1x,.-_aes256_encrypt_1x |
|
676 |
|
677 .align 32 |
|
678 _aes256_encrypt_2x: |
|
679 aes_eround01 %f16, %f0, %f2, %f8 |
|
680 aes_eround23 %f18, %f0, %f2, %f2 |
|
681 aes_eround01 %f16, %f4, %f6, %f10 |
|
682 aes_eround23 %f18, %f4, %f6, %f6 |
|
683 ldd [$key + 208], %f16 |
|
684 ldd [$key + 216], %f18 |
|
685 aes_eround01 %f20, %f8, %f2, %f0 |
|
686 aes_eround23 %f22, %f8, %f2, %f2 |
|
687 aes_eround01 %f20, %f10, %f6, %f4 |
|
688 aes_eround23 %f22, %f10, %f6, %f6 |
|
689 ldd [$key + 224], %f20 |
|
690 ldd [$key + 232], %f22 |
|
691 ___ |
|
692 for ($i=1; $i<6; $i++) { |
|
693 $code.=<<___; |
|
694 aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8 |
|
695 aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 |
|
696 aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10 |
|
697 aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6 |
|
698 aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0 |
|
699 aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2 |
|
700 aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4 |
|
701 aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6 |
|
702 ___ |
|
703 } |
|
704 $code.=<<___; |
|
705 aes_eround01 %f16, %f0, %f2, %f8 |
|
706 aes_eround23 %f18, %f0, %f2, %f2 |
|
707 aes_eround01 %f16, %f4, %f6, %f10 |
|
708 aes_eround23 %f18, %f4, %f6, %f6 |
|
709 ldd [$key + 16], %f16 |
|
710 ldd [$key + 24], %f18 |
|
711 aes_eround01_l %f20, %f8, %f2, %f0 |
|
712 aes_eround23_l %f22, %f8, %f2, %f2 |
|
713 aes_eround01_l %f20, %f10, %f6, %f4 |
|
714 aes_eround23_l %f22, %f10, %f6, %f6 |
|
715 ldd [$key + 32], %f20 |
|
716 retl |
|
717 ldd [$key + 40], %f22 |
|
718 .type _aes256_encrypt_2x,#function |
|
719 .size _aes256_encrypt_2x,.-_aes256_encrypt_2x |
|
720 |
|
721 .align 32 |
|
722 _aes256_decrypt_1x: |
|
723 aes_dround01 %f16, %f0, %f2, %f4 |
|
724 aes_dround23 %f18, %f0, %f2, %f2 |
|
725 ldd [$key + 208], %f16 |
|
726 ldd [$key + 216], %f18 |
|
727 aes_dround01 %f20, %f4, %f2, %f0 |
|
728 aes_dround23 %f22, %f4, %f2, %f2 |
|
729 ldd [$key + 224], %f20 |
|
730 ldd [$key + 232], %f22 |
|
731 ___ |
|
732 for ($i=1; $i<6; $i++) { |
|
733 $code.=<<___; |
|
734 aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4 |
|
735 aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 |
|
736 aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0 |
|
737 aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2 |
|
738 ___ |
|
739 } |
|
740 $code.=<<___; |
|
741 aes_dround01 %f16, %f0, %f2, %f4 |
|
742 aes_dround23 %f18, %f0, %f2, %f2 |
|
743 ldd [$key + 16], %f16 |
|
744 ldd [$key + 24], %f18 |
|
745 aes_dround01_l %f20, %f4, %f2, %f0 |
|
746 aes_dround23_l %f22, %f4, %f2, %f2 |
|
747 ldd [$key + 32], %f20 |
|
748 retl |
|
749 ldd [$key + 40], %f22 |
|
750 .type _aes256_decrypt_1x,#function |
|
751 .size _aes256_decrypt_1x,.-_aes256_decrypt_1x |
|
752 |
|
753 .align 32 |
|
754 _aes256_decrypt_2x: |
|
755 aes_dround01 %f16, %f0, %f2, %f8 |
|
756 aes_dround23 %f18, %f0, %f2, %f2 |
|
757 aes_dround01 %f16, %f4, %f6, %f10 |
|
758 aes_dround23 %f18, %f4, %f6, %f6 |
|
759 ldd [$key + 208], %f16 |
|
760 ldd [$key + 216], %f18 |
|
761 aes_dround01 %f20, %f8, %f2, %f0 |
|
762 aes_dround23 %f22, %f8, %f2, %f2 |
|
763 aes_dround01 %f20, %f10, %f6, %f4 |
|
764 aes_dround23 %f22, %f10, %f6, %f6 |
|
765 ldd [$key + 224], %f20 |
|
766 ldd [$key + 232], %f22 |
|
767 ___ |
|
768 for ($i=1; $i<6; $i++) { |
|
769 $code.=<<___; |
|
770 aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8 |
|
771 aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 |
|
772 aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10 |
|
773 aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6 |
|
774 aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0 |
|
775 aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2 |
|
776 aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4 |
|
777 aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6 |
|
778 ___ |
|
779 } |
|
780 $code.=<<___; |
|
781 aes_dround01 %f16, %f0, %f2, %f8 |
|
782 aes_dround23 %f18, %f0, %f2, %f2 |
|
783 aes_dround01 %f16, %f4, %f6, %f10 |
|
784 aes_dround23 %f18, %f4, %f6, %f6 |
|
785 ldd [$key + 16], %f16 |
|
786 ldd [$key + 24], %f18 |
|
787 aes_dround01_l %f20, %f8, %f2, %f0 |
|
788 aes_dround23_l %f22, %f8, %f2, %f2 |
|
789 aes_dround01_l %f20, %f10, %f6, %f4 |
|
790 aes_dround23_l %f22, %f10, %f6, %f6 |
|
791 ldd [$key + 32], %f20 |
|
792 retl |
|
793 ldd [$key + 40], %f22 |
|
794 .type _aes256_decrypt_2x,#function |
|
795 .size _aes256_decrypt_2x,.-_aes256_decrypt_2x |
|
796 ___ |
|
797 |
|
798 &alg_cbc_encrypt_implement("aes",128); |
|
799 &alg_cbc_encrypt_implement("aes",192); |
|
800 &alg_cbc_encrypt_implement("aes",256); |
|
801 |
|
802 &alg_cbc_decrypt_implement("aes",128); |
|
803 &alg_cbc_decrypt_implement("aes",192); |
|
804 &alg_cbc_decrypt_implement("aes",256); |
|
805 |
|
806 if ($::evp) { |
|
807 &alg_ctr32_implement("aes",128); |
|
808 &alg_ctr32_implement("aes",192); |
|
809 &alg_ctr32_implement("aes",256); |
|
810 } |
|
811 }}} |
|
812 |
|
813 if (!$::evp) { |
|
814 $code.=<<___; |
|
815 .global AES_encrypt |
|
816 AES_encrypt=aes_t4_encrypt |
|
817 .global AES_decrypt |
|
818 AES_decrypt=aes_t4_decrypt |
|
819 .global AES_set_encrypt_key |
|
820 .align 32 |
|
821 AES_set_encrypt_key: |
|
822 andcc %o2, 7, %g0 ! check alignment |
|
823 bnz,a,pn %icc, 1f |
|
824 mov -1, %o0 |
|
825 brz,a,pn %o0, 1f |
|
826 mov -1, %o0 |
|
827 brz,a,pn %o2, 1f |
|
828 mov -1, %o0 |
|
829 andncc %o1, 0x1c0, %g0 |
|
830 bnz,a,pn %icc, 1f |
|
831 mov -2, %o0 |
|
832 cmp %o1, 128 |
|
833 bl,a,pn %icc, 1f |
|
834 mov -2, %o0 |
|
835 b aes_t4_set_encrypt_key |
|
836 nop |
|
837 1: retl |
|
838 nop |
|
839 .type AES_set_encrypt_key,#function |
|
840 .size AES_set_encrypt_key,.-AES_set_encrypt_key |
|
841 |
|
842 .global AES_set_decrypt_key |
|
843 .align 32 |
|
844 AES_set_decrypt_key: |
|
845 andcc %o2, 7, %g0 ! check alignment |
|
846 bnz,a,pn %icc, 1f |
|
847 mov -1, %o0 |
|
848 brz,a,pn %o0, 1f |
|
849 mov -1, %o0 |
|
850 brz,a,pn %o2, 1f |
|
851 mov -1, %o0 |
|
852 andncc %o1, 0x1c0, %g0 |
|
853 bnz,a,pn %icc, 1f |
|
854 mov -2, %o0 |
|
855 cmp %o1, 128 |
|
856 bl,a,pn %icc, 1f |
|
857 mov -2, %o0 |
|
858 b aes_t4_set_decrypt_key |
|
859 nop |
|
860 1: retl |
|
861 nop |
|
862 .type AES_set_decrypt_key,#function |
|
863 .size AES_set_decrypt_key,.-AES_set_decrypt_key |
|
864 ___ |
|
865 |
|
866 my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5)); |
|
867 |
|
868 $code.=<<___; |
|
869 .globl AES_cbc_encrypt |
|
870 .align 32 |
|
871 AES_cbc_encrypt: |
|
872 ld [$key + 240], %g1 |
|
873 nop |
|
874 brz $enc, .Lcbc_decrypt |
|
875 cmp %g1, 12 |
|
876 |
|
877 bl,pt %icc, aes128_t4_cbc_encrypt |
|
878 nop |
|
879 be,pn %icc, aes192_t4_cbc_encrypt |
|
880 nop |
|
881 ba aes256_t4_cbc_encrypt |
|
882 nop |
|
883 |
|
884 .Lcbc_decrypt: |
|
885 bl,pt %icc, aes128_t4_cbc_decrypt |
|
886 nop |
|
887 be,pn %icc, aes192_t4_cbc_decrypt |
|
888 nop |
|
889 ba aes256_t4_cbc_decrypt |
|
890 nop |
|
891 .type AES_cbc_encrypt,#function |
|
892 .size AES_cbc_encrypt,.-AES_cbc_encrypt |
|
893 ___ |
|
894 } |
|
895 $code.=<<___; |
|
896 .asciz "AES for SPARC T4, David S. Miller, Andy Polyakov" |
|
897 .align 4 |
|
898 ___ |
|
899 |
|
900 &emit_assembler(); |
|
901 |
|
902 close STDOUT; |
|