components/openssl/openssl-fips/inline-t4/aest4-sparcv9.pl
changeset 7239 81dd404b35f2
equal deleted inserted replaced
7238:96025c3f5cac 7239:81dd404b35f2
       
     1 #!/usr/bin/env perl
       
     2 
       
     3 # ====================================================================
       
     4 # Written by David S. Miller <[email protected]> and Andy Polyakov
       
     5 # <[email protected]>. The module is licensed under 2-clause BSD
       
     6 # license. October 2012. All rights reserved.
       
     7 # ====================================================================
       
     8 
       
     9 ######################################################################
       
    10 # AES for SPARC T4.
       
    11 #
       
    12 # AES round instructions complete in 3 cycles and can be issued every
       
    13 # cycle. It means that round calculations should take 4*rounds cycles,
       
    14 # because any given round instruction depends on result of *both*
       
    15 # previous instructions:
       
    16 #
       
    17 #	|0 |1 |2 |3 |4
       
    18 #	|01|01|01|
       
    19 #	   |23|23|23|
       
    20 #	            |01|01|...
       
    21 #	               |23|...
       
    22 #
       
    23 # Provided that fxor [with IV] takes 3 cycles to complete, critical
       
    24 # path length for CBC encrypt would be 3+4*rounds, or in other words
       
    25 # it should process one byte in at least (3+4*rounds)/16 cycles. This
       
    26 # estimate doesn't account for "collateral" instructions, such as
       
    27 # fetching input from memory, xor-ing it with zero-round key and
       
    28 # storing the result. Yet, *measured* performance [for data aligned
       
    29 # at 64-bit boundary!] deviates from this equation by less than 0.5%:
       
    30 #
       
    31 #		128-bit key	192-		256-
       
    32 # CBC encrypt	2.70/2.90(*)	3.20/3.40	3.70/3.90
       
    33 #			 (*) numbers after slash are for
       
    34 #			     misaligned data.
       
    35 #
       
    36 # Out-of-order execution logic managed to fully overlap "collateral"
       
    37 # instructions with those on critical path. Amazing!
       
    38 #
       
    39 # As with Intel AES-NI, question is if it's possible to improve
       
    40 # performance of parallelizable modes by interleaving round
       
    41 # instructions. Provided round instruction latency and throughput
       
    42 # optimal interleave factor is 2. But can we expect 2x performance
       
    43 # improvement? Well, as round instructions can be issued one per
       
    44 # cycle, they don't saturate the 2-way issue pipeline and therefore
       
    45 # there is room for "collateral" calculations... Yet, 2x speed-up
       
    46 # over CBC encrypt remains unattainable:
       
    47 #
       
    48 #		128-bit key	192-		256-
       
    49 # CBC decrypt	1.64/2.11	1.89/2.37	2.23/2.61
       
    50 # CTR		1.64/2.08(*)	1.89/2.33	2.23/2.61
       
    51 #			 (*) numbers after slash are for
       
    52 #			     misaligned data.
       
    53 #
       
    54 # Estimates based on amount of instructions under assumption that
       
    55 # round instructions are not pairable with any other instruction
       
    56 # suggest that latter is the actual case and pipeline runs
       
    57 # underutilized. It should be noted that T4 out-of-order execution
       
    58 # logic is so capable that performance gain from 2x interleave is
       
    59 # not even impressive, ~7-13% over non-interleaved code, largest
       
    60 # for 256-bit keys.
       
    61 
       
    62 # To anchor to something else, software implementation processes
       
    63 # one byte in 29 cycles with 128-bit key on same processor. Intel
       
    64 # Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
       
    65 # in 0.93, naturally with AES-NI.
       
    66 
       
    67 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
       
    68 push(@INC,"${dir}","${dir}../../perlasm");
       
    69 require "sparcv9_modes.pl";
       
    70 
       
    71 &asm_init(@ARGV);
       
    72 
       
    73 $::evp=1;	# if $evp is set to 0, script generates module with
       
    74 # AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
       
    75 # points. These however are not fully compatible with openssl/aes.h,
       
    76 # because they expect AES_KEY to be aligned at 64-bit boundary. When
       
    77 # used through EVP, alignment is arranged at EVP layer. Second thing
       
    78 # that is arranged by EVP is at least 32-bit alignment of IV.
       
    79 
       
    80 ######################################################################
       
    81 # single-round subroutines
       
    82 #
       
    83 {
       
    84 my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
       
    85 
       
    86 $code.=<<___ if ($::abibits==64);
       
    87 .register	%g2,#scratch
       
    88 .register	%g3,#scratch
       
    89 
       
    90 ___
       
    91 $code.=<<___;
       
    92 #include <openssl/fipssyms.h>
       
    93 
       
    94 .text
       
    95 
       
    96 .globl	aes_t4_encrypt
       
    97 .align	32
       
    98 aes_t4_encrypt:
       
    99 	andcc		$inp, 7, %g1		! is input aligned?
       
   100 	andn		$inp, 7, $inp
       
   101 
       
   102 	ldx		[$key + 0], %g4
       
   103 	ldx		[$key + 8], %g5
       
   104 
       
   105 	ldx		[$inp + 0], %o4
       
   106 	bz,pt		%icc, 1f
       
   107 	ldx		[$inp + 8], %o5
       
   108 	ldx		[$inp + 16], $inp
       
   109 	sll		%g1, 3, %g1
       
   110 	sub		%g0, %g1, %o3
       
   111 	sllx		%o4, %g1, %o4
       
   112 	sllx		%o5, %g1, %g1
       
   113 	srlx		%o5, %o3, %o5
       
   114 	srlx		$inp, %o3, %o3
       
   115 	or		%o5, %o4, %o4
       
   116 	or		%o3, %g1, %o5
       
   117 1:
       
   118 	ld		[$key + 240], $rounds
       
   119 	ldd		[$key + 16], %f12
       
   120 	ldd		[$key + 24], %f14
       
   121 	xor		%g4, %o4, %o4
       
   122 	xor		%g5, %o5, %o5
       
   123 	movxtod		%o4, %f0
       
   124 	movxtod		%o5, %f2
       
   125 	srl		$rounds, 1, $rounds
       
   126 	ldd		[$key + 32], %f16
       
   127 	sub		$rounds, 1, $rounds
       
   128 	ldd		[$key + 40], %f18
       
   129 	add		$key, 48, $key
       
   130 
       
   131 .Lenc:
       
   132 	aes_eround01	%f12, %f0, %f2, %f4
       
   133 	aes_eround23	%f14, %f0, %f2, %f2
       
   134 	ldd		[$key + 0], %f12
       
   135 	ldd		[$key + 8], %f14
       
   136 	sub		$rounds,1,$rounds
       
   137 	aes_eround01	%f16, %f4, %f2, %f0
       
   138 	aes_eround23	%f18, %f4, %f2, %f2
       
   139 	ldd		[$key + 16], %f16
       
   140 	ldd		[$key + 24], %f18
       
   141 	brnz,pt		$rounds, .Lenc
       
   142 	add		$key, 32, $key
       
   143 
       
   144 	andcc		$out, 7, $tmp		! is output aligned?
       
   145 	aes_eround01	%f12, %f0, %f2, %f4
       
   146 	aes_eround23	%f14, %f0, %f2, %f2
       
   147 	aes_eround01_l	%f16, %f4, %f2, %f0
       
   148 	aes_eround23_l	%f18, %f4, %f2, %f2
       
   149 
       
   150 	bnz,pn		%icc, 2f
       
   151 	nop
       
   152 
       
   153 	std		%f0, [$out + 0]
       
   154 	retl
       
   155 	std		%f2, [$out + 8]
       
   156 
       
   157 2:	alignaddrl	$out, %g0, $out
       
   158 	mov		0xff, $mask
       
   159 	srl		$mask, $tmp, $mask
       
   160 
       
   161 	faligndata	%f0, %f0, %f4
       
   162 	faligndata	%f0, %f2, %f6
       
   163 	faligndata	%f2, %f2, %f8
       
   164 
       
   165 	stda		%f4, [$out + $mask]0xc0	! partial store
       
   166 	std		%f6, [$out + 8]
       
   167 	add		$out, 16, $out
       
   168 	orn		%g0, $mask, $mask
       
   169 	retl
       
   170 	stda		%f8, [$out + $mask]0xc0	! partial store
       
   171 .type	aes_t4_encrypt,#function
       
   172 .size	aes_t4_encrypt,.-aes_t4_encrypt
       
   173 
       
   174 .globl	aes_t4_decrypt
       
   175 .align	32
       
   176 aes_t4_decrypt:
       
   177 	andcc		$inp, 7, %g1		! is input aligned?
       
   178 	andn		$inp, 7, $inp
       
   179 
       
   180 	ldx		[$key + 0], %g4
       
   181 	ldx		[$key + 8], %g5
       
   182 
       
   183 	ldx		[$inp + 0], %o4
       
   184 	bz,pt		%icc, 1f
       
   185 	ldx		[$inp + 8], %o5
       
   186 	ldx		[$inp + 16], $inp
       
   187 	sll		%g1, 3, %g1
       
   188 	sub		%g0, %g1, %o3
       
   189 	sllx		%o4, %g1, %o4
       
   190 	sllx		%o5, %g1, %g1
       
   191 	srlx		%o5, %o3, %o5
       
   192 	srlx		$inp, %o3, %o3
       
   193 	or		%o5, %o4, %o4
       
   194 	or		%o3, %g1, %o5
       
   195 1:
       
   196 	ld		[$key + 240], $rounds
       
   197 	ldd		[$key + 16], %f12
       
   198 	ldd		[$key + 24], %f14
       
   199 	xor		%g4, %o4, %o4
       
   200 	xor		%g5, %o5, %o5
       
   201 	movxtod		%o4, %f0
       
   202 	movxtod		%o5, %f2
       
   203 	srl		$rounds, 1, $rounds
       
   204 	ldd		[$key + 32], %f16
       
   205 	sub		$rounds, 1, $rounds
       
   206 	ldd		[$key + 40], %f18
       
   207 	add		$key, 48, $key
       
   208 
       
   209 .Ldec:
       
   210 	aes_dround01	%f12, %f0, %f2, %f4
       
   211 	aes_dround23	%f14, %f0, %f2, %f2
       
   212 	ldd		[$key + 0], %f12
       
   213 	ldd		[$key + 8], %f14
       
   214 	sub		$rounds,1,$rounds
       
   215 	aes_dround01	%f16, %f4, %f2, %f0
       
   216 	aes_dround23	%f18, %f4, %f2, %f2
       
   217 	ldd		[$key + 16], %f16
       
   218 	ldd		[$key + 24], %f18
       
   219 	brnz,pt		$rounds, .Ldec
       
   220 	add		$key, 32, $key
       
   221 
       
   222 	andcc		$out, 7, $tmp		! is output aligned?
       
   223 	aes_dround01	%f12, %f0, %f2, %f4
       
   224 	aes_dround23	%f14, %f0, %f2, %f2
       
   225 	aes_dround01_l	%f16, %f4, %f2, %f0
       
   226 	aes_dround23_l	%f18, %f4, %f2, %f2
       
   227 
       
   228 	bnz,pn		%icc, 2f
       
   229 	nop
       
   230 
       
   231 	std		%f0, [$out + 0]
       
   232 	retl
       
   233 	std		%f2, [$out + 8]
       
   234 
       
   235 2:	alignaddrl	$out, %g0, $out
       
   236 	mov		0xff, $mask
       
   237 	srl		$mask, $tmp, $mask
       
   238 
       
   239 	faligndata	%f0, %f0, %f4
       
   240 	faligndata	%f0, %f2, %f6
       
   241 	faligndata	%f2, %f2, %f8
       
   242 
       
   243 	stda		%f4, [$out + $mask]0xc0	! partial store
       
   244 	std		%f6, [$out + 8]
       
   245 	add		$out, 16, $out
       
   246 	orn		%g0, $mask, $mask
       
   247 	retl
       
   248 	stda		%f8, [$out + $mask]0xc0	! partial store
       
   249 .type	aes_t4_decrypt,#function
       
   250 .size	aes_t4_decrypt,.-aes_t4_decrypt
       
   251 ___
       
   252 }
       
   253 
       
   254 ######################################################################
       
   255 # key setup subroutines
       
   256 #
       
   257 {
       
   258 my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
       
   259 $code.=<<___;
       
   260 .globl	aes_t4_set_encrypt_key
       
   261 .align	32
       
   262 aes_t4_set_encrypt_key:
       
   263 .Lset_encrypt_key:
       
   264 	and		$inp, 7, $tmp
       
   265 	alignaddr	$inp, %g0, $inp
       
   266 	cmp		$bits, 192
       
   267 	ldd		[$inp + 0], %f0
       
   268 	bl,pt		%icc,.L128
       
   269 	ldd		[$inp + 8], %f2
       
   270 
       
   271 	be,pt		%icc,.L192
       
   272 	ldd		[$inp + 16], %f4
       
   273 	brz,pt		$tmp, .L256aligned
       
   274 	ldd		[$inp + 24], %f6
       
   275 
       
   276 	ldd		[$inp + 32], %f8
       
   277 	faligndata	%f0, %f2, %f0
       
   278 	faligndata	%f2, %f4, %f2
       
   279 	faligndata	%f4, %f6, %f4
       
   280 	faligndata	%f6, %f8, %f6
       
   281 .L256aligned:
       
   282 ___
       
   283 for ($i=0; $i<6; $i++) {	# C-style loop on purpose: $i is left at 6 and the heredoc after the loop reuses it
       
   284     $code.=<<___;	# per pass: store 32 bytes at $out+32*$i, then expand %f0..%f6 in place
       
   285 	std		%f0, [$out + `32*$i+0`]
       
   286 	aes_kexpand1	%f0, %f6, $i, %f0
       
   287 	std		%f2, [$out + `32*$i+8`]
       
   288 	aes_kexpand2	%f2, %f0, %f2
       
   289 	std		%f4, [$out + `32*$i+16`]
       
   290 	aes_kexpand0	%f4, %f2, %f4
       
   291 	std		%f6, [$out + `32*$i+24`]
       
   292 	aes_kexpand2	%f6, %f4, %f6
       
   293 ___
       
   294 }
       
   295 $code.=<<___;
       
   296 	std		%f0, [$out + `32*$i+0`]
       
   297 	aes_kexpand1	%f0, %f6, $i, %f0
       
   298 	std		%f2, [$out + `32*$i+8`]
       
   299 	aes_kexpand2	%f2, %f0, %f2
       
   300 	std		%f4, [$out + `32*$i+16`]
       
   301 	std		%f6, [$out + `32*$i+24`]
       
   302 	std		%f0, [$out + `32*$i+32`]
       
   303 	std		%f2, [$out + `32*$i+40`]
       
   304 
       
   305 	mov		14, $tmp
       
   306 	st		$tmp, [$out + 240]
       
   307 	retl
       
   308 	xor		%o0, %o0, %o0
       
   309 
       
   310 .align	16
       
   311 .L192:
       
   312 	brz,pt		$tmp, .L192aligned
       
   313 	nop
       
   314 
       
   315 	ldd		[$inp + 24], %f6
       
   316 	faligndata	%f0, %f2, %f0
       
   317 	faligndata	%f2, %f4, %f2
       
   318 	faligndata	%f4, %f6, %f4
       
   319 .L192aligned:
       
   320 ___
       
   321 for ($i=0; $i<7; $i++) {	# C-style loop on purpose: $i is left at 7 and the heredoc after the loop reuses it
       
   322     $code.=<<___;	# per pass: store 24 bytes at $out+24*$i, then expand %f0..%f4 in place
       
   323 	std		%f0, [$out + `24*$i+0`]
       
   324 	aes_kexpand1	%f0, %f4, $i, %f0
       
   325 	std		%f2, [$out + `24*$i+8`]
       
   326 	aes_kexpand2	%f2, %f0, %f2
       
   327 	std		%f4, [$out + `24*$i+16`]
       
   328 	aes_kexpand2	%f4, %f2, %f4
       
   329 ___
       
   330 }
       
   331 $code.=<<___;
       
   332 	std		%f0, [$out + `24*$i+0`]
       
   333 	aes_kexpand1	%f0, %f4, $i, %f0
       
   334 	std		%f2, [$out + `24*$i+8`]
       
   335 	aes_kexpand2	%f2, %f0, %f2
       
   336 	std		%f4, [$out + `24*$i+16`]
       
   337 	std		%f0, [$out + `24*$i+24`]
       
   338 	std		%f2, [$out + `24*$i+32`]
       
   339 
       
   340 	mov		12, $tmp
       
   341 	st		$tmp, [$out + 240]
       
   342 	retl
       
   343 	xor		%o0, %o0, %o0
       
   344 
       
   345 .align	16
       
   346 .L128:
       
   347 	brz,pt		$tmp, .L128aligned
       
   348 	nop
       
   349 
       
   350 	ldd		[$inp + 16], %f4
       
   351 	faligndata	%f0, %f2, %f0
       
   352 	faligndata	%f2, %f4, %f2
       
   353 .L128aligned:
       
   354 ___
       
   355 for ($i=0; $i<10; $i++) {	# C-style loop on purpose: $i is left at 10 and the heredoc after the loop reuses it
       
   356     $code.=<<___;	# per pass: store 16 bytes at $out+16*$i, then expand %f0/%f2 in place
       
   357 	std		%f0, [$out + `16*$i+0`]
       
   358 	aes_kexpand1	%f0, %f2, $i, %f0
       
   359 	std		%f2, [$out + `16*$i+8`]
       
   360 	aes_kexpand2	%f2, %f0, %f2
       
   361 ___
       
   362 }
       
   363 $code.=<<___;
       
   364 	std		%f0, [$out + `16*$i+0`]
       
   365 	std		%f2, [$out + `16*$i+8`]
       
   366 
       
   367 	mov		10, $tmp
       
   368 	st		$tmp, [$out + 240]
       
   369 	retl
       
   370 	xor		%o0, %o0, %o0
       
   371 .type	aes_t4_set_encrypt_key,#function
       
   372 .size	aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
       
   373 
       
   374 .globl	aes_t4_set_decrypt_key
       
   375 .align	32
       
   376 aes_t4_set_decrypt_key:
       
   377 	mov		%o7, %o5
       
   378 	call		.Lset_encrypt_key
       
   379 	nop
       
   380 
       
   381 	mov		%o5, %o7
       
   382 	sll		$tmp, 4, $inp		! $tmp is number of rounds
       
   383 	add		$tmp, 2, $tmp
       
   384 	add		$out, $inp, $inp	! $inp=$out+16*rounds
       
   385 	srl		$tmp, 2, $tmp		! $tmp=(rounds+2)/4
       
   386 
       
   387 .Lkey_flip:
       
   388 	ldd		[$out + 0],  %f0
       
   389 	ldd		[$out + 8],  %f2
       
   390 	ldd		[$out + 16], %f4
       
   391 	ldd		[$out + 24], %f6
       
   392 	ldd		[$inp + 0],  %f8
       
   393 	ldd		[$inp + 8],  %f10
       
   394 	ldd		[$inp - 16], %f12
       
   395 	ldd		[$inp - 8],  %f14
       
   396 	sub		$tmp, 1, $tmp
       
   397 	std		%f0, [$inp + 0]
       
   398 	std		%f2, [$inp + 8]
       
   399 	std		%f4, [$inp - 16]
       
   400 	std		%f6, [$inp - 8]
       
   401 	std		%f8, [$out + 0]
       
   402 	std		%f10, [$out + 8]
       
   403 	std		%f12, [$out + 16]
       
   404 	std		%f14, [$out + 24]
       
   405 	add		$out, 32, $out
       
   406 	brnz		$tmp, .Lkey_flip
       
   407 	sub		$inp, 32, $inp
       
   408 
       
   409 	retl
       
   410 	xor		%o0, %o0, %o0
       
   411 .type	aes_t4_set_decrypt_key,#function
       
   412 .size	aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
       
   413 ___
       
   414 }
       
   415 
       
   416 {{{
       
   417 my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
       
   418 my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
       
   419 
       
   420 $code.=<<___;
       
   421 .align	32
       
   422 _aes128_encrypt_1x:
       
   423 ___
       
   424 for ($i=0; $i<4; $i++) {	# i=0..3 walks key registers %f16..%f46; %f48..%f54 are used by the hand-written tail
       
   425     $code.=<<___;	# each pass consumes four consecutive key registers, ping-ponging state between %f0 and %f4
       
   426 	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
       
   427 	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
       
   428 	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
       
   429 	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
       
   430 ___
       
   431 }
       
   432 $code.=<<___;
       
   433 	aes_eround01	%f48, %f0, %f2, %f4
       
   434 	aes_eround23	%f50, %f0, %f2, %f2
       
   435 	aes_eround01_l	%f52, %f4, %f2, %f0
       
   436 	retl
       
   437 	aes_eround23_l	%f54, %f4, %f2, %f2
       
   438 .type	_aes128_encrypt_1x,#function
       
   439 .size	_aes128_encrypt_1x,.-_aes128_encrypt_1x
       
   440 
       
   441 .align	32
       
   442 _aes128_encrypt_2x:
       
   443 ___
       
   444 for ($i=0; $i<4; $i++) {
       
   445     $code.=<<___;
       
   446 	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
       
   447 	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
       
   448 	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
       
   449 	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
       
   450 	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
       
   451 	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
       
   452 	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
       
   453 	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
       
   454 ___
       
   455 }
       
   456 $code.=<<___;
       
   457 	aes_eround01	%f48, %f0, %f2, %f8
       
   458 	aes_eround23	%f50, %f0, %f2, %f2
       
   459 	aes_eround01	%f48, %f4, %f6, %f10
       
   460 	aes_eround23	%f50, %f4, %f6, %f6
       
   461 	aes_eround01_l	%f52, %f8, %f2, %f0
       
   462 	aes_eround23_l	%f54, %f8, %f2, %f2
       
   463 	aes_eround01_l	%f52, %f10, %f6, %f4
       
   464 	retl
       
   465 	aes_eround23_l	%f54, %f10, %f6, %f6
       
   466 .type	_aes128_encrypt_2x,#function
       
   467 .size	_aes128_encrypt_2x,.-_aes128_encrypt_2x
       
   468 
       
   469 .align	32
       
   470 _aes128_loadkey:
       
   471 	ldx		[$key + 0], %g4
       
   472 	ldx		[$key + 8], %g5
       
   473 ___
       
   474 for ($i=2; $i<22;$i++) {			# load key schedule: [$key+16]..[$key+168] into %f16..%f54 ([$key+0],[$key+8] already went to %g4,%g5)
       
   475     $code.=<<___;
       
   476 	ldd		[$key + `8*$i`], %f`12+2*$i`
       
   477 ___
       
   478 }
       
   479 $code.=<<___;
       
   480 	retl
       
   481 	nop
       
   482 .type	_aes128_loadkey,#function
       
   483 .size	_aes128_loadkey,.-_aes128_loadkey
       
   484 _aes128_load_enckey=_aes128_loadkey
       
   485 _aes128_load_deckey=_aes128_loadkey
       
   486 
       
   487 ___
       
   488 
       
   489 &alg_cbc_encrypt_implement("aes",128);
       
   490 if ($::evp) {
       
   491     &alg_ctr32_implement("aes",128);
       
   492     &alg_xts_implement("aes",128,"en");
       
   493     &alg_xts_implement("aes",128,"de");
       
   494 }
       
   495 &alg_cbc_decrypt_implement("aes",128);
       
   496 
       
   497 $code.=<<___;
       
   498 .align	32
       
   499 _aes128_decrypt_1x:
       
   500 ___
       
   501 for ($i=0; $i<4; $i++) {
       
   502     $code.=<<___;
       
   503 	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
       
   504 	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
       
   505 	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
       
   506 	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
       
   507 ___
       
   508 }
       
   509 $code.=<<___;
       
   510 	aes_dround01	%f48, %f0, %f2, %f4
       
   511 	aes_dround23	%f50, %f0, %f2, %f2
       
   512 	aes_dround01_l	%f52, %f4, %f2, %f0
       
   513 	retl
       
   514 	aes_dround23_l	%f54, %f4, %f2, %f2
       
   515 .type	_aes128_decrypt_1x,#function
       
   516 .size	_aes128_decrypt_1x,.-_aes128_decrypt_1x
       
   517 
       
   518 .align	32
       
   519 _aes128_decrypt_2x:
       
   520 ___
       
   521 for ($i=0; $i<4; $i++) {
       
   522     $code.=<<___;
       
   523 	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
       
   524 	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
       
   525 	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
       
   526 	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
       
   527 	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
       
   528 	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
       
   529 	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
       
   530 	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
       
   531 ___
       
   532 }
       
   533 $code.=<<___;
       
   534 	aes_dround01	%f48, %f0, %f2, %f8
       
   535 	aes_dround23	%f50, %f0, %f2, %f2
       
   536 	aes_dround01	%f48, %f4, %f6, %f10
       
   537 	aes_dround23	%f50, %f4, %f6, %f6
       
   538 	aes_dround01_l	%f52, %f8, %f2, %f0
       
   539 	aes_dround23_l	%f54, %f8, %f2, %f2
       
   540 	aes_dround01_l	%f52, %f10, %f6, %f4
       
   541 	retl
       
   542 	aes_dround23_l	%f54, %f10, %f6, %f6
       
   543 .type	_aes128_decrypt_2x,#function
       
   544 .size	_aes128_decrypt_2x,.-_aes128_decrypt_2x
       
   545 ___
       
   546 
       
   547 $code.=<<___;
       
   548 .align	32
       
   549 _aes192_encrypt_1x:
       
   550 ___
       
   551 for ($i=0; $i<5; $i++) {
       
   552     $code.=<<___;
       
   553 	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
       
   554 	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
       
   555 	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
       
   556 	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
       
   557 ___
       
   558 }
       
   559 $code.=<<___;
       
   560 	aes_eround01	%f56, %f0, %f2, %f4
       
   561 	aes_eround23	%f58, %f0, %f2, %f2
       
   562 	aes_eround01_l	%f60, %f4, %f2, %f0
       
   563 	retl
       
   564 	aes_eround23_l	%f62, %f4, %f2, %f2
       
   565 .type	_aes192_encrypt_1x,#function
       
   566 .size	_aes192_encrypt_1x,.-_aes192_encrypt_1x
       
   567 
       
   568 .align	32
       
   569 _aes192_encrypt_2x:
       
   570 ___
       
   571 for ($i=0; $i<5; $i++) {
       
   572     $code.=<<___;
       
   573 	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
       
   574 	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
       
   575 	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
       
   576 	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
       
   577 	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
       
   578 	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
       
   579 	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
       
   580 	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
       
   581 ___
       
   582 }
       
   583 $code.=<<___;
       
   584 	aes_eround01	%f56, %f0, %f2, %f8
       
   585 	aes_eround23	%f58, %f0, %f2, %f2
       
   586 	aes_eround01	%f56, %f4, %f6, %f10
       
   587 	aes_eround23	%f58, %f4, %f6, %f6
       
   588 	aes_eround01_l	%f60, %f8, %f2, %f0
       
   589 	aes_eround23_l	%f62, %f8, %f2, %f2
       
   590 	aes_eround01_l	%f60, %f10, %f6, %f4
       
   591 	retl
       
   592 	aes_eround23_l	%f62, %f10, %f6, %f6
       
   593 .type	_aes192_encrypt_2x,#function
       
   594 .size	_aes192_encrypt_2x,.-_aes192_encrypt_2x
       
   595 
       
   596 .align	32
       
   597 _aes256_encrypt_1x:
       
   598 	aes_eround01	%f16, %f0, %f2, %f4
       
   599 	aes_eround23	%f18, %f0, %f2, %f2
       
   600 	ldd		[$key + 208], %f16
       
   601 	ldd		[$key + 216], %f18
       
   602 	aes_eround01	%f20, %f4, %f2, %f0
       
   603 	aes_eround23	%f22, %f4, %f2, %f2
       
   604 	ldd		[$key + 224], %f20
       
   605 	ldd		[$key + 232], %f22
       
   606 ___
       
   607 for ($i=1; $i<6; $i++) {	# starts at 1: the %f16..%f22 slot is emitted by hand above, where those registers are also reloaded from $key+208..232
       
   608     $code.=<<___;	# i=1..5 walks key registers %f24..%f62
       
   609 	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
       
   610 	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
       
   611 	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
       
   612 	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
       
   613 ___
       
   614 }
       
   615 $code.=<<___;
       
   616 	aes_eround01	%f16, %f0, %f2, %f4
       
   617 	aes_eround23	%f18, %f0, %f2, %f2
       
   618 	ldd		[$key + 16], %f16
       
   619 	ldd		[$key + 24], %f18
       
   620 	aes_eround01_l	%f20, %f4, %f2, %f0
       
   621 	aes_eround23_l	%f22, %f4, %f2, %f2
       
   622 	ldd		[$key + 32], %f20
       
   623 	retl
       
   624 	ldd		[$key + 40], %f22
       
   625 .type	_aes256_encrypt_1x,#function
       
   626 .size	_aes256_encrypt_1x,.-_aes256_encrypt_1x
       
   627 
       
   628 .align	32
       
   629 _aes256_encrypt_2x:
       
   630 	aes_eround01	%f16, %f0, %f2, %f8
       
   631 	aes_eround23	%f18, %f0, %f2, %f2
       
   632 	aes_eround01	%f16, %f4, %f6, %f10
       
   633 	aes_eround23	%f18, %f4, %f6, %f6
       
   634 	ldd		[$key + 208], %f16
       
   635 	ldd		[$key + 216], %f18
       
   636 	aes_eround01	%f20, %f8, %f2, %f0
       
   637 	aes_eround23	%f22, %f8, %f2, %f2
       
   638 	aes_eround01	%f20, %f10, %f6, %f4
       
   639 	aes_eround23	%f22, %f10, %f6, %f6
       
   640 	ldd		[$key + 224], %f20
       
   641 	ldd		[$key + 232], %f22
       
   642 ___
       
   643 for ($i=1; $i<6; $i++) {
       
   644     $code.=<<___;
       
   645 	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
       
   646 	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
       
   647 	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
       
   648 	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
       
   649 	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
       
   650 	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
       
   651 	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
       
   652 	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
       
   653 ___
       
   654 }
       
   655 $code.=<<___;
       
   656 	aes_eround01	%f16, %f0, %f2, %f8
       
   657 	aes_eround23	%f18, %f0, %f2, %f2
       
   658 	aes_eround01	%f16, %f4, %f6, %f10
       
   659 	aes_eround23	%f18, %f4, %f6, %f6
       
   660 	ldd		[$key + 16], %f16
       
   661 	ldd		[$key + 24], %f18
       
   662 	aes_eround01_l	%f20, %f8, %f2, %f0
       
   663 	aes_eround23_l	%f22, %f8, %f2, %f2
       
   664 	aes_eround01_l	%f20, %f10, %f6, %f4
       
   665 	aes_eround23_l	%f22, %f10, %f6, %f6
       
   666 	ldd		[$key + 32], %f20
       
   667 	retl
       
   668 	ldd		[$key + 40], %f22
       
   669 .type	_aes256_encrypt_2x,#function
       
   670 .size	_aes256_encrypt_2x,.-_aes256_encrypt_2x
       
   671 
       
   672 .align	32
       
   673 _aes192_loadkey:
       
   674 	ldx		[$key + 0], %g4
       
   675 	ldx		[$key + 8], %g5
       
   676 ___
       
   677 for ($i=2; $i<26;$i++) {			# load key schedule: [$key+16]..[$key+200] into %f16..%f62 ([$key+0],[$key+8] already went to %g4,%g5)
       
   678     $code.=<<___;
       
   679 	ldd		[$key + `8*$i`], %f`12+2*$i`
       
   680 ___
       
   681 }
       
   682 $code.=<<___;
       
   683 	retl
       
   684 	nop
       
   685 .type	_aes192_loadkey,#function
       
   686 .size	_aes192_loadkey,.-_aes192_loadkey
       
   687 _aes256_loadkey=_aes192_loadkey
       
   688 _aes192_load_enckey=_aes192_loadkey
       
   689 _aes192_load_deckey=_aes192_loadkey
       
   690 _aes256_load_enckey=_aes192_loadkey
       
   691 _aes256_load_deckey=_aes192_loadkey
       
   692 ___
       
   693 
       
   694 &alg_cbc_encrypt_implement("aes",256);
       
   695 &alg_cbc_encrypt_implement("aes",192);
       
   696 if ($::evp) {
       
   697     &alg_ctr32_implement("aes",256);
       
   698     &alg_xts_implement("aes",256,"en");
       
   699     &alg_xts_implement("aes",256,"de");
       
   700     &alg_ctr32_implement("aes",192);
       
   701 }
       
   702 &alg_cbc_decrypt_implement("aes",192);
       
   703 &alg_cbc_decrypt_implement("aes",256);
       
   704 
       
   705 $code.=<<___;
       
   706 .align	32
       
   707 _aes256_decrypt_1x:
       
   708 	aes_dround01	%f16, %f0, %f2, %f4
       
   709 	aes_dround23	%f18, %f0, %f2, %f2
       
   710 	ldd		[$key + 208], %f16
       
   711 	ldd		[$key + 216], %f18
       
   712 	aes_dround01	%f20, %f4, %f2, %f0
       
   713 	aes_dround23	%f22, %f4, %f2, %f2
       
   714 	ldd		[$key + 224], %f20
       
   715 	ldd		[$key + 232], %f22
       
   716 ___
       
   717 for ($i=1; $i<6; $i++) {
       
   718     $code.=<<___;
       
   719 	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
       
   720 	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
       
   721 	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
       
   722 	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
       
   723 ___
       
   724 }
       
   725 $code.=<<___;
       
   726 	aes_dround01	%f16, %f0, %f2, %f4
       
   727 	aes_dround23	%f18, %f0, %f2, %f2
       
   728 	ldd		[$key + 16], %f16
       
   729 	ldd		[$key + 24], %f18
       
   730 	aes_dround01_l	%f20, %f4, %f2, %f0
       
   731 	aes_dround23_l	%f22, %f4, %f2, %f2
       
   732 	ldd		[$key + 32], %f20
       
   733 	retl
       
   734 	ldd		[$key + 40], %f22
       
   735 .type	_aes256_decrypt_1x,#function
       
   736 .size	_aes256_decrypt_1x,.-_aes256_decrypt_1x
       
   737 
       
   738 .align	32
       
   739 _aes256_decrypt_2x:
       
   740 	aes_dround01	%f16, %f0, %f2, %f8
       
   741 	aes_dround23	%f18, %f0, %f2, %f2
       
   742 	aes_dround01	%f16, %f4, %f6, %f10
       
   743 	aes_dround23	%f18, %f4, %f6, %f6
       
   744 	ldd		[$key + 208], %f16
       
   745 	ldd		[$key + 216], %f18
       
   746 	aes_dround01	%f20, %f8, %f2, %f0
       
   747 	aes_dround23	%f22, %f8, %f2, %f2
       
   748 	aes_dround01	%f20, %f10, %f6, %f4
       
   749 	aes_dround23	%f22, %f10, %f6, %f6
       
   750 	ldd		[$key + 224], %f20
       
   751 	ldd		[$key + 232], %f22
       
   752 ___
       
   753 for ($i=1; $i<6; $i++) {
       
   754     $code.=<<___;
       
   755 	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
       
   756 	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
       
   757 	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
       
   758 	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
       
   759 	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
       
   760 	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
       
   761 	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
       
   762 	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
       
   763 ___
       
   764 }
       
   765 $code.=<<___;
       
   766 	aes_dround01	%f16, %f0, %f2, %f8
       
   767 	aes_dround23	%f18, %f0, %f2, %f2
       
   768 	aes_dround01	%f16, %f4, %f6, %f10
       
   769 	aes_dround23	%f18, %f4, %f6, %f6
       
   770 	ldd		[$key + 16], %f16
       
   771 	ldd		[$key + 24], %f18
       
   772 	aes_dround01_l	%f20, %f8, %f2, %f0
       
   773 	aes_dround23_l	%f22, %f8, %f2, %f2
       
   774 	aes_dround01_l	%f20, %f10, %f6, %f4
       
   775 	aes_dround23_l	%f22, %f10, %f6, %f6
       
   776 	ldd		[$key + 32], %f20
       
   777 	retl
       
   778 	ldd		[$key + 40], %f22
       
   779 .type	_aes256_decrypt_2x,#function
       
   780 .size	_aes256_decrypt_2x,.-_aes256_decrypt_2x
       
   781 
       
   782 .align	32
       
   783 _aes192_decrypt_1x:
       
   784 ___
       
       # _aes192_decrypt_1x: emits the decrypt round sequence for a single block.
       # The loop produces 5 iterations of two dround01/dround23 pairs each; the
       # first operand walks %f16,%f18,%f20,%f22 ... %f48,%f50,%f52,%f54 (step 8
       # per iteration).  The heredoc after the loop adds one more plain pair on
       # %f56/%f58 and the closing "_l" pair on %f60/%f62, i.e. 12 pairs total.
       # Data is read from and left in %f0/%f2; %f4 is the only temporary.
       # No loads are emitted here -- presumably the caller has already filled
       # %f16..%f62 with the key schedule; confirm against the call sites.
   785 for ($i=0; $i<5; $i++) {
       
   786     $code.=<<___;	# backticked arithmetic is left as text here; expected to be evaluated later -- TODO confirm in emit_assembler
       
   787 	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
       
   788 	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
       
   789 	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
       
   790 	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
       
   791 ___
       
   792 }
       
   793 $code.=<<___;	# final two pairs; the last aes_dround23_l sits in retl's delay slot
       
   794 	aes_dround01	%f56, %f0, %f2, %f4
       
   795 	aes_dround23	%f58, %f0, %f2, %f2
       
   796 	aes_dround01_l	%f60, %f4, %f2, %f0
       
   797 	retl
       
   798 	aes_dround23_l	%f62, %f4, %f2, %f2
       
   799 .type	_aes192_decrypt_1x,#function
       
   800 .size	_aes192_decrypt_1x,.-_aes192_decrypt_1x
       
   801 
       
   802 .align	32
       
   803 _aes192_decrypt_2x:
       
   804 ___
       
       # _aes192_decrypt_2x: same round sequence as _aes192_decrypt_1x, but for two
       # independent blocks whose instructions are interleaved.  Block A lives in
       # %f0/%f2 with temporary %f8; block B lives in %f4/%f6 with temporary %f10.
       # The loop emits 5 iterations (first operand %f16..%f54, step 8 per
       # iteration), each applying two dround01/dround23 pairs to both blocks; the
       # heredoc after the loop adds the %f56/%f58 pair and the closing "_l" pair
       # on %f60/%f62 for both blocks.
       # No loads are emitted here -- presumably the caller has already filled
       # %f16..%f62 with the key schedule; confirm against the call sites.
   805 for ($i=0; $i<5; $i++) {
       
   806     $code.=<<___;	# backticked arithmetic is left as text here; expected to be evaluated later -- TODO confirm in emit_assembler
       
   807 	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
       
   808 	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
       
   809 	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
       
   810 	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
       
   811 	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
       
   812 	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
       
   813 	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
       
   814 	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
       
   815 ___
       
   816 }
       
   817 $code.=<<___;	# final two pairs for both blocks; the last aes_dround23_l sits in retl's delay slot
       
   818 	aes_dround01	%f56, %f0, %f2, %f8
       
   819 	aes_dround23	%f58, %f0, %f2, %f2
       
   820 	aes_dround01	%f56, %f4, %f6, %f10
       
   821 	aes_dround23	%f58, %f4, %f6, %f6
       
   822 	aes_dround01_l	%f60, %f8, %f2, %f0
       
   823 	aes_dround23_l	%f62, %f8, %f2, %f2
       
   824 	aes_dround01_l	%f60, %f10, %f6, %f4
       
   825 	retl
       
   826 	aes_dround23_l	%f62, %f10, %f6, %f6
       
   827 .type	_aes192_decrypt_2x,#function
       
   828 .size	_aes192_decrypt_2x,.-_aes192_decrypt_2x
       
   829 ___
       
   830 }}}
       
   831 
       
   832 if (!$::evp) {	# emitted only when $::evp is false (the flag is set outside this chunk)
       
       # Classic AES_* entry points layered on the aes_t4_* routines:
       #   - AES_encrypt / AES_decrypt are plain symbol aliases.
       #   - AES_set_encrypt_key / AES_set_decrypt_key validate their arguments
       #     and then branch (no call, so the target returns straight to our
       #     caller) to aes_t4_set_encrypt_key / aes_t4_set_decrypt_key.
       #     Checks, in order:
       #         %o2 & 7 != 0            -> return -1   (alignment)
       #         %o0 == 0                -> return -1
       #         %o2 == 0                -> return -1
       #         %o1 & ~0x1c0 != 0       -> return -2
       #         %o1 < 128               -> return -2
       #     Every failure branch carries the annul bit (",a"), so the mov that
       #     sets the error code in its delay slot runs only if the branch is
       #     taken.
       #     Presumably %o0/%o1/%o2 are userKey/bits/key -- confirm against the
       #     C prototype of AES_set_encrypt_key.
       #     NOTE(review): the mask-then-compare test also lets 320, 384 and 448
       #     through to aes_t4_set_*_key -- confirm that is acceptable.
   833 $code.=<<___;
       
   834 .global	AES_encrypt
       
   835 AES_encrypt=aes_t4_encrypt
       
   836 .global	AES_decrypt
       
   837 AES_decrypt=aes_t4_decrypt
       
   838 .global	AES_set_encrypt_key
       
   839 .align	32
       
   840 AES_set_encrypt_key:
       
   841 	andcc		%o2, 7, %g0		! check alignment
       
   842 	bnz,a,pn	%icc, 1f
       
   843 	mov		-1, %o0
       
   844 	brz,a,pn	%o0, 1f
       
   845 	mov		-1, %o0
       
   846 	brz,a,pn	%o2, 1f
       
   847 	mov		-1, %o0
       
   848 	andncc		%o1, 0x1c0, %g0
       
   849 	bnz,a,pn	%icc, 1f
       
   850 	mov		-2, %o0
       
   851 	cmp		%o1, 128
       
   852 	bl,a,pn		%icc, 1f
       
   853 	mov		-2, %o0
       
   854 	b		aes_t4_set_encrypt_key
       
   855 	nop
       
   856 1:	retl
       
   857 	nop
       
   858 .type	AES_set_encrypt_key,#function
       
   859 .size	AES_set_encrypt_key,.-AES_set_encrypt_key
       
   860 
       
   861 .global	AES_set_decrypt_key
       
   862 .align	32
       
   863 AES_set_decrypt_key:
       
   864 	andcc		%o2, 7, %g0		! check alignment
       
   865 	bnz,a,pn	%icc, 1f
       
   866 	mov		-1, %o0
       
   867 	brz,a,pn	%o0, 1f
       
   868 	mov		-1, %o0
       
   869 	brz,a,pn	%o2, 1f
       
   870 	mov		-1, %o0
       
   871 	andncc		%o1, 0x1c0, %g0
       
   872 	bnz,a,pn	%icc, 1f
       
   873 	mov		-2, %o0
       
   874 	cmp		%o1, 128
       
   875 	bl,a,pn		%icc, 1f
       
   876 	mov		-2, %o0
       
   877 	b		aes_t4_set_decrypt_key
       
   878 	nop
       
   879 1:	retl
       
   880 	nop
       
   881 .type	AES_set_decrypt_key,#function
       
   882 .size	AES_set_decrypt_key,.-AES_set_decrypt_key
       
   883 ___
       
   884 
       
       # Symbolic names for the six argument registers %o0..%o5, interpolated
       # into the AES_cbc_encrypt heredoc below.
   885 my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
       
   886 
       
       # AES_cbc_encrypt: dispatcher only.  It loads the 32-bit word at
       # [$key + 240] into %g1 (presumably the round count stored in the key
       # schedule -- confirm against the key-setup code) and compares it with 12.
       # The cmp sits in the delay slot of the non-annulled "brz $enc", so the
       # condition codes are valid on both the encrypt and the decrypt path.
       # Each path then branches: less than 12 -> aes128_t4_cbc_*, equal to
       # 12 -> aes192_t4_cbc_*, otherwise -> aes256_t4_cbc_*.  All targets are
       # reached with plain branches, so they return directly to our caller.
   887 $code.=<<___;
       
   888 .globl	AES_cbc_encrypt
       
   889 .align	32
       
   890 AES_cbc_encrypt:
       
   891 	ld		[$key + 240], %g1
       
   892 	nop
       
   893 	brz		$enc, .Lcbc_decrypt
       
   894 	cmp		%g1, 12
       
   895 
       
   896 	bl,pt		%icc, aes128_t4_cbc_encrypt
       
   897 	nop
       
   898 	be,pn		%icc, aes192_t4_cbc_encrypt
       
   899 	nop
       
   900 	ba		aes256_t4_cbc_encrypt
       
   901 	nop
       
   902 
       
   903 .Lcbc_decrypt:
       
   904 	bl,pt		%icc, aes128_t4_cbc_decrypt
       
   905 	nop
       
   906 	be,pn		%icc, aes192_t4_cbc_decrypt
       
   907 	nop
       
   908 	ba		aes256_t4_cbc_decrypt
       
   909 	nop
       
   910 .type	AES_cbc_encrypt,#function
       
   911 .size	AES_cbc_encrypt,.-AES_cbc_encrypt
       
   912 ___
       
   913 }	# end if (!$::evp)
       
   914 $code.=<<___;	# trailer: identification string embedded in the object, then re-align
       
   915 .asciz	"AES for SPARC T4, David S. Miller, Andy Polyakov"
       
   916 .align	4
       
   917 ___
       
   918 
       
       # emit_assembler() is defined outside this file chunk; presumably it
       # post-processes the accumulated $code and prints the final assembly to
       # STDOUT -- confirm in the helper module that provides it.
   919 &emit_assembler();
       
   920 
       
       # STDOUT is normally redirected into the generated .s file.  Buffered
       # write errors (full disk, broken pipe) only surface at close time, so
       # the close must be checked: otherwise a truncated assembly file is
       # produced while the script still exits with status 0.
   921 close STDOUT or die "error closing STDOUT: $!";