components/openssl/openssl-1.0.1/inline-t4/aest4-sparcv9.pl
changeset 4822 1fb8a14c6702
parent 4821 54dafbe33fdb
child 4823 3ef8b7f4d9d8
equal deleted inserted replaced
4821:54dafbe33fdb 4822:1fb8a14c6702
     1 #!/usr/bin/env perl
       
     2 
       
     3 # ====================================================================
       
     4 # Written by David S. Miller <[email protected]> and Andy Polyakov
       
     5 # <[email protected]>. The module is licensed under 2-clause BSD
       
     6 # license. October 2012. All rights reserved.
       
     7 # ====================================================================
       
     8 
       
     9 ######################################################################
       
    10 # AES for SPARC T4.
       
    11 #
       
    12 # AES round instructions complete in 3 cycles and can be issued every
       
    13 # cycle. It means that round calculations should take 4*rounds cycles,
       
    14 # because any given round instruction depends on result of *both*
       
    15 # previous instructions:
       
    16 #
       
    17 #	|0 |1 |2 |3 |4
       
    18 #	|01|01|01|
       
    19 #	   |23|23|23|
       
    20 #	            |01|01|...
       
    21 #	               |23|...
       
    22 #
       
    23 # Provided that fxor [with IV] takes 3 cycles to complete, critical
       
    24 # path length for CBC encrypt would be 3+4*rounds, or in other words
       
    25 # it should process one byte in at least (3+4*rounds)/16 cycles. This
       
    26 # estimate doesn't account for "collateral" instructions, such as
       
    27 # fetching input from memory, xor-ing it with zero-round key and
       
    28 # storing the result. Yet, *measured* performance [for data aligned
       
    29 # at 64-bit boundary!] deviates from this equation by less than 0.5%:
       
    30 #
       
    31 #		128-bit key	192-		256-
       
    32 # CBC encrypt	2.70/2.90(*)	3.20/3.40	3.70/3.90
       
    33 #			 (*) numbers after slash are for
       
    34 #			     misaligned data.
       
    35 #
       
    36 # Out-of-order execution logic managed to fully overlap "collateral"
       
    37 # instructions with those on critical path. Amazing!
       
    38 #
       
    39 # As with Intel AES-NI, question is if it's possible to improve
       
     40 # performance of parallelizable modes by interleaving round
       
    41 # instructions. Provided round instruction latency and throughput
       
    42 # optimal interleave factor is 2. But can we expect 2x performance
       
    43 # improvement? Well, as round instructions can be issued one per
       
    44 # cycle, they don't saturate the 2-way issue pipeline and therefore
       
    45 # there is room for "collateral" calculations... Yet, 2x speed-up
       
     46 # over CBC encrypt remains unattainable:
       
    47 #
       
    48 #		128-bit key	192-		256-
       
    49 # CBC decrypt	1.64/2.11	1.89/2.37	2.23/2.61
       
    50 # CTR		1.64/2.08(*)	1.89/2.33	2.23/2.61
       
    51 #			 (*) numbers after slash are for
       
    52 #			     misaligned data.
       
    53 #
       
    54 # Estimates based on amount of instructions under assumption that
       
    55 # round instructions are not pairable with any other instruction
       
    56 # suggest that latter is the actual case and pipeline runs
       
    57 # underutilized. It should be noted that T4 out-of-order execution
       
    58 # logic is so capable that performance gain from 2x interleave is
       
    59 # not even impressive, ~7-13% over non-interleaved code, largest
       
    60 # for 256-bit keys.
       
    61 
       
    62 # To anchor to something else, software implementation processes
       
    63 # one byte in 29 cycles with 128-bit key on same processor. Intel
       
    64 # Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
       
    65 # in 0.93, naturally with AES-NI.
       
    66 
       
# Locate this script's own directory so the shared perlasm helpers can be
# found relative to it (both the local dir and ../../perlasm are searched).
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";	# presumably supplies asm_init() and the
				# alg_{cbc,ctr32}_*_implement() generators
				# used below -- TODO confirm

&asm_init(@ARGV);		# select output flavour from command line

$::evp=1;	# if $evp is set to 0, script generates module with
# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
# points. These however are not fully compatible with openssl/aes.h,
# because they expect AES_KEY to be aligned at 64-bit boundary. When
# used through EVP, alignment is arranged at EVP layer. Second thing
# that is arranged by EVP is at least 32-bit alignment of IV.
       
    80 ######################################################################
       
    81 # single-round subroutines
       
    82 #
       
{
my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));

# aes_t4_encrypt / aes_t4_decrypt: single 16-byte block operations built
# on the T4 aes_[ed]round* instructions.  Misaligned input is handled by
# loading three 64-bit words and merging them with shift/or; misaligned
# output is handled with faligndata plus edge-masked partial stores
# (stda ...0xc0).  The round count is read from key[240] (standard
# AES_KEY layout); the loop body performs two rounds per iteration,
# with the final two rounds peeled off so the *_l (last-round)
# instruction forms can be used.  The key schedule itself is assumed
# 64-bit aligned (see the $::evp note above in the file).
$code=<<___;
.text

.globl	aes_t4_encrypt
.align	32
aes_t4_encrypt:
	andcc		$inp, 7, %g1		! is input aligned?
	andn		$inp, 7, $inp

	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5

	ldx		[$inp + 0], %o4
	bz,pt		%icc, 1f
	ldx		[$inp + 8], %o5
	ldx		[$inp + 16], $inp
	sll		%g1, 3, %g1
	sub		%g0, %g1, %o3
	sllx		%o4, %g1, %o4
	sllx		%o5, %g1, %g1
	srlx		%o5, %o3, %o5
	srlx		$inp, %o3, %o3
	or		%o5, %o4, %o4
	or		%o3, %g1, %o5
1:
	ld		[$key + 240], $rounds
	ldd		[$key + 16], %f12
	ldd		[$key + 24], %f14
	xor		%g4, %o4, %o4
	xor		%g5, %o5, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	srl		$rounds, 1, $rounds
	ldd		[$key + 32], %f16
	sub		$rounds, 1, $rounds
	ldd		[$key + 40], %f18
	add		$key, 48, $key

.Lenc:
	aes_eround01	%f12, %f0, %f2, %f4
	aes_eround23	%f14, %f0, %f2, %f2
	ldd		[$key + 0], %f12
	ldd		[$key + 8], %f14
	sub		$rounds,1,$rounds
	aes_eround01	%f16, %f4, %f2, %f0
	aes_eround23	%f18, %f4, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	brnz,pt		$rounds, .Lenc
	add		$key, 32, $key

	andcc		$out, 7, $tmp		! is output aligned?
	aes_eround01	%f12, %f0, %f2, %f4
	aes_eround23	%f14, %f0, %f2, %f2
	aes_eround01_l	%f16, %f4, %f2, %f0
	aes_eround23_l	%f18, %f4, %f2, %f2

	bnz,pn		%icc, 2f
	nop

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

2:	alignaddrl	$out, %g0, $out
	mov		0xff, $mask
	srl		$mask, $tmp, $mask

	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $mask, $mask
	retl
	stda		%f8, [$out + $mask]0xc0	! partial store
.type	aes_t4_encrypt,#function
.size	aes_t4_encrypt,.-aes_t4_encrypt

.globl	aes_t4_decrypt
.align	32
aes_t4_decrypt:
	andcc		$inp, 7, %g1		! is input aligned?
	andn		$inp, 7, $inp

	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5

	ldx		[$inp + 0], %o4
	bz,pt		%icc, 1f
	ldx		[$inp + 8], %o5
	ldx		[$inp + 16], $inp
	sll		%g1, 3, %g1
	sub		%g0, %g1, %o3
	sllx		%o4, %g1, %o4
	sllx		%o5, %g1, %g1
	srlx		%o5, %o3, %o5
	srlx		$inp, %o3, %o3
	or		%o5, %o4, %o4
	or		%o3, %g1, %o5
1:
	ld		[$key + 240], $rounds
	ldd		[$key + 16], %f12
	ldd		[$key + 24], %f14
	xor		%g4, %o4, %o4
	xor		%g5, %o5, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	srl		$rounds, 1, $rounds
	ldd		[$key + 32], %f16
	sub		$rounds, 1, $rounds
	ldd		[$key + 40], %f18
	add		$key, 48, $key

.Ldec:
	aes_dround01	%f12, %f0, %f2, %f4
	aes_dround23	%f14, %f0, %f2, %f2
	ldd		[$key + 0], %f12
	ldd		[$key + 8], %f14
	sub		$rounds,1,$rounds
	aes_dround01	%f16, %f4, %f2, %f0
	aes_dround23	%f18, %f4, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	brnz,pt		$rounds, .Ldec
	add		$key, 32, $key

	andcc		$out, 7, $tmp		! is output aligned?
	aes_dround01	%f12, %f0, %f2, %f4
	aes_dround23	%f14, %f0, %f2, %f2
	aes_dround01_l	%f16, %f4, %f2, %f0
	aes_dround23_l	%f18, %f4, %f2, %f2

	bnz,pn		%icc, 2f
	nop

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

2:	alignaddrl	$out, %g0, $out
	mov		0xff, $mask
	srl		$mask, $tmp, $mask

	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $mask, $mask
	retl
	stda		%f8, [$out + $mask]0xc0	! partial store
.type	aes_t4_decrypt,#function
.size	aes_t4_decrypt,.-aes_t4_decrypt
___
}
       
   246 
       
   247 ######################################################################
       
   248 # key setup subroutines
       
   249 #
       
{
my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
# aes_t4_set_encrypt_key expands a 128/192/256-bit user key (selected by
# $bits via the cmp/bl/be ladder) into the schedule at $out using the
# aes_kexpand{0,1,2} instructions.  Misaligned $inp is absorbed with
# alignaddr/faligndata; the round count (10/12/14) is stored at
# $out+240 and also returned in $tmp for the decrypt-key path.
$code.=<<___;
.globl	aes_t4_set_encrypt_key
.align	32
aes_t4_set_encrypt_key:
.Lset_encrypt_key:
	and		$inp, 7, $tmp
	alignaddr	$inp, %g0, $inp
	cmp		$bits, 192
	ldd		[$inp + 0], %f0
	bl,pt		%icc,.L128
	ldd		[$inp + 8], %f2

	be,pt		%icc,.L192
	ldd		[$inp + 16], %f4
	brz,pt		$tmp, .L256aligned
	ldd		[$inp + 24], %f6

	ldd		[$inp + 32], %f8
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f8, %f6
.L256aligned:
___
# 256-bit: six full expansion steps, each storing 32 bytes of schedule...
for ($i=0; $i<6; $i++) {
    $code.=<<___;
	std		%f0, [$out + `32*$i+0`]
	aes_kexpand1	%f0, %f6, $i, %f0
	std		%f2, [$out + `32*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `32*$i+16`]
	aes_kexpand0	%f4, %f2, %f4
	std		%f6, [$out + `32*$i+24`]
	aes_kexpand2	%f6, %f4, %f6
___
}
# ...plus one trailing partial step; 14 rounds are recorded.
$code.=<<___;
	std		%f0, [$out + `32*$i+0`]
	aes_kexpand1	%f0, %f6, $i, %f0
	std		%f2, [$out + `32*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `32*$i+16`]
	std		%f6, [$out + `32*$i+24`]
	std		%f0, [$out + `32*$i+32`]
	std		%f2, [$out + `32*$i+40`]

	mov		14, $tmp
	st		$tmp, [$out + 240]
	retl
	xor		%o0, %o0, %o0

.align	16
.L192:
	brz,pt		$tmp, .L192aligned
	nop

	ldd		[$inp + 24], %f6
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
.L192aligned:
___
# 192-bit: seven expansion steps of 24 bytes each...
for ($i=0; $i<7; $i++) {
    $code.=<<___;
	std		%f0, [$out + `24*$i+0`]
	aes_kexpand1	%f0, %f4, $i, %f0
	std		%f2, [$out + `24*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `24*$i+16`]
	aes_kexpand2	%f4, %f2, %f4
___
}
# ...plus a trailing partial step; 12 rounds are recorded.
$code.=<<___;
	std		%f0, [$out + `24*$i+0`]
	aes_kexpand1	%f0, %f4, $i, %f0
	std		%f2, [$out + `24*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `24*$i+16`]
	std		%f0, [$out + `24*$i+24`]
	std		%f2, [$out + `24*$i+32`]

	mov		12, $tmp
	st		$tmp, [$out + 240]
	retl
	xor		%o0, %o0, %o0

.align	16
.L128:
	brz,pt		$tmp, .L128aligned
	nop

	ldd		[$inp + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
.L128aligned:
___
# 128-bit: ten expansion steps of 16 bytes each; 10 rounds recorded.
for ($i=0; $i<10; $i++) {
    $code.=<<___;
	std		%f0, [$out + `16*$i+0`]
	aes_kexpand1	%f0, %f2, $i, %f0
	std		%f2, [$out + `16*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
___
}
# Final 128-bit stores, then aes_t4_set_decrypt_key: it tail-calls the
# encrypt-key expansion (preserving %o7 in %o5), then .Lkey_flip swaps
# opposite 32-byte ends of the schedule working inward -- presumably so
# the decrypt side can walk the schedule in the same direction; confirm
# against the aes_t4_decrypt consumer above.
$code.=<<___;
	std		%f0, [$out + `16*$i+0`]
	std		%f2, [$out + `16*$i+8`]

	mov		10, $tmp
	st		$tmp, [$out + 240]
	retl
	xor		%o0, %o0, %o0
.type	aes_t4_set_encrypt_key,#function
.size	aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key

.globl	aes_t4_set_decrypt_key
.align	32
aes_t4_set_decrypt_key:
	mov		%o7, %o5
	call		.Lset_encrypt_key
	nop

	mov		%o5, %o7
	sll		$tmp, 4, $inp		! $tmp is number of rounds
	add		$tmp, 2, $tmp
	add		$out, $inp, $inp	! $inp=$out+16*rounds
	srl		$tmp, 2, $tmp		! $tmp=(rounds+2)/4

.Lkey_flip:
	ldd		[$out + 0],  %f0
	ldd		[$out + 8],  %f2
	ldd		[$out + 16], %f4
	ldd		[$out + 24], %f6
	ldd		[$inp + 0],  %f8
	ldd		[$inp + 8],  %f10
	ldd		[$inp - 16], %f12
	ldd		[$inp - 8],  %f14
	sub		$tmp, 1, $tmp
	std		%f0, [$inp + 0]
	std		%f2, [$inp + 8]
	std		%f4, [$inp - 16]
	std		%f6, [$inp - 8]
	std		%f8, [$out + 0]
	std		%f10, [$out + 8]
	std		%f12, [$out + 16]
	std		%f14, [$out + 24]
	add		$out, 32, $out
	brnz		$tmp, .Lkey_flip
	sub		$inp, 32, $inp

	retl
	xor		%o0, %o0, %o0
.type	aes_t4_set_decrypt_key,#function
.size	aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
___
}
       
   408 
       
{{{
my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));

# Internal leaf helpers consumed by the CBC/CTR code emitted at the end
# of this block.  _aesNNN_load*key pulls key[0..15] into %g4/%g5 and the
# remaining schedule into %f16 upward (f16..f54 for 128-bit, f16..f62
# for 192/256-bit).  _aesNNN_{en,de}crypt_1x runs all rounds on one
# block held in %f0/%f2; the _2x variants interleave two blocks
# (%f0/%f2 and %f4/%f6) for the parallelizable modes, per the interleave
# discussion in the file header.  The 256-bit variants cannot keep the
# whole schedule in registers, so they reload %f16-%f22 from
# [$key+208..232] mid-flight and restore [$key+16..40] before returning.
$code.=<<___;
.align	32
_aes128_loadkey:
	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5
___
for ($i=2; $i<22;$i++) {			# load key schedule
    $code.=<<___;
	ldd		[$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
	retl
	nop
.type	_aes128_loadkey,#function
.size	_aes128_loadkey,.-_aes128_loadkey
_aes128_load_enckey=_aes128_loadkey
_aes128_load_deckey=_aes128_loadkey

.align	32
_aes128_encrypt_1x:
___
# 128-bit, one block: 4 double-round iterations + peeled last rounds.
for ($i=0; $i<4; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_eround01	%f48, %f0, %f2, %f4
	aes_eround23	%f50, %f0, %f2, %f2
	aes_eround01_l	%f52, %f4, %f2, %f0
	retl
	aes_eround23_l	%f54, %f4, %f2, %f2
.type	_aes128_encrypt_1x,#function
.size	_aes128_encrypt_1x,.-_aes128_encrypt_1x

.align	32
_aes128_encrypt_2x:
___
# 128-bit, two interleaved blocks.
for ($i=0; $i<4; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_eround01	%f48, %f0, %f2, %f8
	aes_eround23	%f50, %f0, %f2, %f2
	aes_eround01	%f48, %f4, %f6, %f10
	aes_eround23	%f50, %f4, %f6, %f6
	aes_eround01_l	%f52, %f8, %f2, %f0
	aes_eround23_l	%f54, %f8, %f2, %f2
	aes_eround01_l	%f52, %f10, %f6, %f4
	retl
	aes_eround23_l	%f54, %f10, %f6, %f6
.type	_aes128_encrypt_2x,#function
.size	_aes128_encrypt_2x,.-_aes128_encrypt_2x

.align	32
_aes128_decrypt_1x:
___
# Decrypt counterparts mirror the encrypt helpers with aes_dround*.
for ($i=0; $i<4; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_dround01	%f48, %f0, %f2, %f4
	aes_dround23	%f50, %f0, %f2, %f2
	aes_dround01_l	%f52, %f4, %f2, %f0
	retl
	aes_dround23_l	%f54, %f4, %f2, %f2
.type	_aes128_decrypt_1x,#function
.size	_aes128_decrypt_1x,.-_aes128_decrypt_1x

.align	32
_aes128_decrypt_2x:
___
for ($i=0; $i<4; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_dround01	%f48, %f0, %f2, %f8
	aes_dround23	%f50, %f0, %f2, %f2
	aes_dround01	%f48, %f4, %f6, %f10
	aes_dround23	%f50, %f4, %f6, %f6
	aes_dround01_l	%f52, %f8, %f2, %f0
	aes_dround23_l	%f54, %f8, %f2, %f2
	aes_dround01_l	%f52, %f10, %f6, %f4
	retl
	aes_dround23_l	%f54, %f10, %f6, %f6
.type	_aes128_decrypt_2x,#function
.size	_aes128_decrypt_2x,.-_aes128_decrypt_2x

.align	32
_aes192_loadkey:
_aes256_loadkey:
	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5
___
# 192- and 256-bit share one loader (f16..f62 covers both schedules).
for ($i=2; $i<26;$i++) {			# load key schedule
    $code.=<<___;
	ldd		[$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
	retl
	nop
.type	_aes192_loadkey,#function
.size	_aes192_loadkey,.-_aes192_loadkey
_aes192_load_enckey=_aes192_loadkey
_aes192_load_deckey=_aes192_loadkey
_aes256_load_enckey=_aes192_loadkey
_aes256_load_deckey=_aes192_loadkey

.align	32
_aes192_encrypt_1x:
___
# 192-bit, one block: 5 double-round iterations + peeled last rounds.
for ($i=0; $i<5; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_eround01	%f56, %f0, %f2, %f4
	aes_eround23	%f58, %f0, %f2, %f2
	aes_eround01_l	%f60, %f4, %f2, %f0
	retl
	aes_eround23_l	%f62, %f4, %f2, %f2
.type	_aes192_encrypt_1x,#function
.size	_aes192_encrypt_1x,.-_aes192_encrypt_1x

.align	32
_aes192_encrypt_2x:
___
for ($i=0; $i<5; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_eround01	%f56, %f0, %f2, %f8
	aes_eround23	%f58, %f0, %f2, %f2
	aes_eround01	%f56, %f4, %f6, %f10
	aes_eround23	%f58, %f4, %f6, %f6
	aes_eround01_l	%f60, %f8, %f2, %f0
	aes_eround23_l	%f62, %f8, %f2, %f2
	aes_eround01_l	%f60, %f10, %f6, %f4
	retl
	aes_eround23_l	%f62, %f10, %f6, %f6
.type	_aes192_encrypt_2x,#function
.size	_aes192_encrypt_2x,.-_aes192_encrypt_2x

.align	32
_aes192_decrypt_1x:
___
for ($i=0; $i<5; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_dround01	%f56, %f0, %f2, %f4
	aes_dround23	%f58, %f0, %f2, %f2
	aes_dround01_l	%f60, %f4, %f2, %f0
	retl
	aes_dround23_l	%f62, %f4, %f2, %f2
.type	_aes192_decrypt_1x,#function
.size	_aes192_decrypt_1x,.-_aes192_decrypt_1x

.align	32
_aes192_decrypt_2x:
___
for ($i=0; $i<5; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
# 256-bit helpers: first double-round is written out by hand because it
# overlaps with the %f16-%f22 reload described above.
$code.=<<___;
	aes_dround01	%f56, %f0, %f2, %f8
	aes_dround23	%f58, %f0, %f2, %f2
	aes_dround01	%f56, %f4, %f6, %f10
	aes_dround23	%f58, %f4, %f6, %f6
	aes_dround01_l	%f60, %f8, %f2, %f0
	aes_dround23_l	%f62, %f8, %f2, %f2
	aes_dround01_l	%f60, %f10, %f6, %f4
	retl
	aes_dround23_l	%f62, %f10, %f6, %f6
.type	_aes192_decrypt_2x,#function
.size	_aes192_decrypt_2x,.-_aes192_decrypt_2x

.align	32
_aes256_encrypt_1x:
	aes_eround01	%f16, %f0, %f2, %f4
	aes_eround23	%f18, %f0, %f2, %f2
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_eround01	%f20, %f4, %f2, %f0
	aes_eround23	%f22, %f4, %f2, %f2
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_eround01	%f16, %f0, %f2, %f4
	aes_eround23	%f18, %f0, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_eround01_l	%f20, %f4, %f2, %f0
	aes_eround23_l	%f22, %f4, %f2, %f2
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_encrypt_1x,#function
.size	_aes256_encrypt_1x,.-_aes256_encrypt_1x

.align	32
_aes256_encrypt_2x:
	aes_eround01	%f16, %f0, %f2, %f8
	aes_eround23	%f18, %f0, %f2, %f2
	aes_eround01	%f16, %f4, %f6, %f10
	aes_eround23	%f18, %f4, %f6, %f6
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_eround01	%f20, %f8, %f2, %f0
	aes_eround23	%f22, %f8, %f2, %f2
	aes_eround01	%f20, %f10, %f6, %f4
	aes_eround23	%f22, %f10, %f6, %f6
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
    $code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_eround01	%f16, %f0, %f2, %f8
	aes_eround23	%f18, %f0, %f2, %f2
	aes_eround01	%f16, %f4, %f6, %f10
	aes_eround23	%f18, %f4, %f6, %f6
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_eround01_l	%f20, %f8, %f2, %f0
	aes_eround23_l	%f22, %f8, %f2, %f2
	aes_eround01_l	%f20, %f10, %f6, %f4
	aes_eround23_l	%f22, %f10, %f6, %f6
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_encrypt_2x,#function
.size	_aes256_encrypt_2x,.-_aes256_encrypt_2x

.align	32
_aes256_decrypt_1x:
	aes_dround01	%f16, %f0, %f2, %f4
	aes_dround23	%f18, %f0, %f2, %f2
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_dround01	%f20, %f4, %f2, %f0
	aes_dround23	%f22, %f4, %f2, %f2
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_dround01	%f16, %f0, %f2, %f4
	aes_dround23	%f18, %f0, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_dround01_l	%f20, %f4, %f2, %f0
	aes_dround23_l	%f22, %f4, %f2, %f2
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_decrypt_1x,#function
.size	_aes256_decrypt_1x,.-_aes256_decrypt_1x

.align	32
_aes256_decrypt_2x:
	aes_dround01	%f16, %f0, %f2, %f8
	aes_dround23	%f18, %f0, %f2, %f2
	aes_dround01	%f16, %f4, %f6, %f10
	aes_dround23	%f18, %f4, %f6, %f6
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_dround01	%f20, %f8, %f2, %f0
	aes_dround23	%f22, %f8, %f2, %f2
	aes_dround01	%f20, %f10, %f6, %f4
	aes_dround23	%f22, %f10, %f6, %f6
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
    $code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_dround01	%f16, %f0, %f2, %f8
	aes_dround23	%f18, %f0, %f2, %f2
	aes_dround01	%f16, %f4, %f6, %f10
	aes_dround23	%f18, %f4, %f6, %f6
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_dround01_l	%f20, %f8, %f2, %f0
	aes_dround23_l	%f22, %f8, %f2, %f2
	aes_dround01_l	%f20, %f10, %f6, %f4
	aes_dround23_l	%f22, %f10, %f6, %f6
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_decrypt_2x,#function
.size	_aes256_decrypt_2x,.-_aes256_decrypt_2x
___

# Emit the public CBC entry points for all three key sizes, plus CTR32
# for EVP builds; the generators are presumably provided by the required
# sparcv9_modes.pl and call back into the _aesNNN_* helpers above --
# TODO confirm against that file.
&alg_cbc_encrypt_implement("aes",128);
&alg_cbc_encrypt_implement("aes",192);
&alg_cbc_encrypt_implement("aes",256);

&alg_cbc_decrypt_implement("aes",128);
&alg_cbc_decrypt_implement("aes",192);
&alg_cbc_decrypt_implement("aes",256);

if ($::evp) {
    &alg_ctr32_implement("aes",128);
    &alg_ctr32_implement("aes",192);
    &alg_ctr32_implement("aes",256);
}
}}}
       
   812 
       
   813 if (!$::evp) {
       
   814 $code.=<<___;
       
   815 .global	AES_encrypt
       
   816 AES_encrypt=aes_t4_encrypt
       
   817 .global	AES_decrypt
       
   818 AES_decrypt=aes_t4_decrypt
       
   819 .global	AES_set_encrypt_key
       
   820 .align	32
       
   821 AES_set_encrypt_key:
       
   822 	andcc		%o2, 7, %g0		! check alignment
       
   823 	bnz,a,pn	%icc, 1f
       
   824 	mov		-1, %o0
       
   825 	brz,a,pn	%o0, 1f
       
   826 	mov		-1, %o0
       
   827 	brz,a,pn	%o2, 1f
       
   828 	mov		-1, %o0
       
   829 	andncc		%o1, 0x1c0, %g0
       
   830 	bnz,a,pn	%icc, 1f
       
   831 	mov		-2, %o0
       
   832 	cmp		%o1, 128
       
   833 	bl,a,pn		%icc, 1f
       
   834 	mov		-2, %o0
       
   835 	b		aes_t4_set_encrypt_key
       
   836 	nop
       
   837 1:	retl
       
   838 	nop
       
   839 .type	AES_set_encrypt_key,#function
       
   840 .size	AES_set_encrypt_key,.-AES_set_encrypt_key
       
   841 
       
   842 .global	AES_set_decrypt_key
       
   843 .align	32
       
   844 AES_set_decrypt_key:
       
   845 	andcc		%o2, 7, %g0		! check alignment
       
   846 	bnz,a,pn	%icc, 1f
       
   847 	mov		-1, %o0
       
   848 	brz,a,pn	%o0, 1f
       
   849 	mov		-1, %o0
       
   850 	brz,a,pn	%o2, 1f
       
   851 	mov		-1, %o0
       
   852 	andncc		%o1, 0x1c0, %g0
       
   853 	bnz,a,pn	%icc, 1f
       
   854 	mov		-2, %o0
       
   855 	cmp		%o1, 128
       
   856 	bl,a,pn		%icc, 1f
       
   857 	mov		-2, %o0
       
   858 	b		aes_t4_set_decrypt_key
       
   859 	nop
       
   860 1:	retl
       
   861 	nop
       
   862 .type	AES_set_decrypt_key,#function
       
   863 .size	AES_set_decrypt_key,.-AES_set_decrypt_key
       
   864 ___
       
   865 
       
   866 my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
       
   867 
       
   868 $code.=<<___;
       
   869 .globl	AES_cbc_encrypt
       
   870 .align	32
       
   871 AES_cbc_encrypt:
       
   872 	ld		[$key + 240], %g1
       
   873 	nop
       
   874 	brz		$enc, .Lcbc_decrypt
       
   875 	cmp		%g1, 12
       
   876 
       
   877 	bl,pt		%icc, aes128_t4_cbc_encrypt
       
   878 	nop
       
   879 	be,pn		%icc, aes192_t4_cbc_encrypt
       
   880 	nop
       
   881 	ba		aes256_t4_cbc_encrypt
       
   882 	nop
       
   883 
       
   884 .Lcbc_decrypt:
       
   885 	bl,pt		%icc, aes128_t4_cbc_decrypt
       
   886 	nop
       
   887 	be,pn		%icc, aes192_t4_cbc_decrypt
       
   888 	nop
       
   889 	ba		aes256_t4_cbc_decrypt
       
   890 	nop
       
   891 .type	AES_cbc_encrypt,#function
       
   892 .size	AES_cbc_encrypt,.-AES_cbc_encrypt
       
   893 ___
       
   894 }
       
   895 $code.=<<___;
       
   896 .asciz	"AES for SPARC T4, David S. Miller, Andy Polyakov"
       
   897 .align	4
       
   898 ___
       
   899 
       
   900 &emit_assembler();
       
   901 
       
   902 close STDOUT;