components/openssl/openssl-1.0.1/inline-t4/md5-sparcv9.pl
changeset 4822 1fb8a14c6702
parent 4821 54dafbe33fdb
child 4823 3ef8b7f4d9d8
equal deleted inserted replaced
4821:54dafbe33fdb 4822:1fb8a14c6702
     1 #!/usr/bin/env perl
       
     2 
       
     3 # ====================================================================
       
     4 # Written by Andy Polyakov <[email protected]> for the OpenSSL
       
     5 # project. The module is, however, dual licensed under OpenSSL and
       
     6 # CRYPTOGAMS licenses depending on where you obtain it. For further
       
     7 # details see http://www.openssl.org/~appro/cryptogams/.
       
     8 #
       
     9 # Hardware SPARC T4 support by David S. Miller <[email protected]>.
       
    10 # ====================================================================
       
    11 
       
    12 # MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
       
    13 # code generated by Sun C 5.2.
       
    14 
       
    15 # SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
       
    16 # faster than software. Multi-process benchmark saturates at 12x
       
    17 # single-process result on 8-core processor, or ~11GBps per 2.85GHz
       
    18 # socket.
       
    19 
       
    # Select the target ABI.  Any command-line argument that looks like
    # -m64 or -xarch=v9 switches code generation to 64-bit; the default
    # is 32-bit.
    $bits=32;
    for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }

    # Stack bias and register-window frame size for the selected ABI.
    # $frame is the amount subtracted from %sp by the "save" instruction
    # of the software code path.
    if ($bits==64)	{ $bias=2047; $frame=192; }
    else		{ $bias=0;    $frame=112; }

    # The first command-line argument names the output file.  Use the
    # three-argument form of open so that the file name is never parsed
    # for mode characters, and fail loudly instead of silently discarding
    # the generated assembly when the file cannot be created.  When no
    # file name is given, STDOUT is left untouched.
    $output=shift;
    if (defined($output) && $output ne "") {
	open STDOUT,">",$output or die "can't open $output: $!";
    }
       
    27 
       
    28 use integer;
       
    29 
       
    # Incoming arguments of md5_block_asm_data_order as seen by the
    # software path, i.e. after its "save" instruction.
    $ctx="%i0";
    $inp="%i1";
    $len="%i2";

    # Registers holding 64-bit quantities.  @X receives the input block
    # as 64-bit loads (two 32-bit message words per register); the ninth
    # entry is only loaded when the input pointer is not 8-byte aligned.
    @X=((map { "%o$_" } 0..5),"%o7","%g1","%g2");
    $tx="%g3";
    $AB="%g4";
    $CD="%g5";

    # Registers holding 32-bit quantities: the four state words, three
    # temporaries, the saved %asi value and the alignment shift counts.
    ($A,$B,$C,$D)=("%l0","%l1","%l2","%l3");
    @V=($A,$B,$C,$D);
    ($t1,$t2,$t3,$saved_asi)=("%l4","%l5","%l6","%l7");
    ($shr,$shl1,$shl2)=map { "%i$_" } (3..5);

    # Per-round additive constants.  Every round generator references
    # K[i+1] on behalf of the following round, hence the trailing dummy
    # entry that keeps round 63 within bounds.
    my @K=(
	# rounds 0..15
	0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
	0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
	0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
	0x6b901122,0xfd987193,0xa679438e,0x49b40821,
	# rounds 16..31
	0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
	0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
	0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
	0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
	# rounds 32..47
	0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
	0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
	0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
	0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
	# rounds 48..63
	0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
	0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
	0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
	0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391,
	# dummy K[64]
	0
    );
       
    61 
       
    # R0($i,$a,$b,$c,$d) - append the assembly for round $i of the first
    # group of rounds to $code.  $a..$d are the register names currently
    # playing the roles a,b,c,d.
    #
    # On entry $t1 is expected to hold c^d and $t2 to hold X[i]+K[i]; both
    # are prepared by the code emitted for the preceding round.  The round
    # computes a = b + rotl(a + (d ^ (b & (c ^ d))) + X[i] + K[i], rot)
    # and, interleaved with that, prepares $t1 and $t2 for round i+1.
    # For even $i the next message word is the upper half of X[j]; for odd
    # $i the next 64-bit register is first aligned using $shr/$shl1/$shl2.
    sub R0 {
      my ($i,$a,$b,$c,$d) = @_;
      my @rotations = (7,12,17,22);
      my $rot = $rotations[$i%4];
      my $j   = ($i+1)>>1;		# index of the register used next

      if (($i&1)==0) {
        $code.=<<___;
	 srlx	@X[$j],32,$tx		! extract X[`2*$j+1`]
	and	$b,$t1,$t1		! round $i
	add	$t2,$a,$a
	xor	$d,$t1,$t1
	 sethi	%hi(@K[$i+1]),$t2
	add	$t1,$a,$a
	 or	$t2,%lo(@K[$i+1]),$t2
	sll	$a,$rot,$t3
	 add	$tx,$t2,$t2		! X[`2*$j+1`]+K[`$i+1`]
	srl	$a,32-$rot,$a
	add	$b,$t3,$t3
	 xor	 $b,$c,$t1
	add	$t3,$a,$a
___
      } else {
        $code.=<<___;
	 srlx	@X[$j],$shr,@X[$j]	! align X[`$i+1`]
	and	$b,$t1,$t1		! round $i
	 sllx	@X[$j+1],$shl1,$tx
	add	$t2,$a,$a
	 sllx	$tx,$shl2,$tx
	xor	$d,$t1,$t1
	 or	$tx,@X[$j],@X[$j]
	 sethi	%hi(@K[$i+1]),$t2
	add	$t1,$a,$a
	 or	$t2,%lo(@K[$i+1]),$t2
	sll	$a,$rot,$t3
	 add	@X[$j],$t2,$t2		! X[`$i+1`]+K[`$i+1`]
	srl	$a,32-$rot,$a
	add	$b,$t3,$t3
	 xor	 $b,$c,$t1
	add	$t3,$a,$a
___
      }
    }
       
   104 
       
   # R0_1($i,$a,$b,$c,$d) - variant of R0 used for the last round of the
   # first group (the driver calls it for round 15 only).  The round itself
   # is the same as in R0, but the values prepared for the following round
   # differ: X[1] (upper half of X[0]) is added to K[i+1], and $t1 is
   # loaded with b&~c ("andn") as expected on entry by R1.
   sub R0_1 {
     my ($i,$a,$b,$c,$d) = @_;
     my @rotations = (7,12,17,22);
     my $rot = $rotations[$i%4];

     $code.=<<___;
	 srlx	@X[0],32,$tx		! extract X[1]
	and	$b,$t1,$t1		! round $i
	add	$t2,$a,$a
	xor	$d,$t1,$t1
	 sethi	%hi(@K[$i+1]),$t2
	add	$t1,$a,$a
	 or	$t2,%lo(@K[$i+1]),$t2
	sll	$a,$rot,$t3
	 add	$tx,$t2,$t2		! X[1]+K[`$i+1`]
	srl	$a,32-$rot,$a
	add	$b,$t3,$t3
	 andn	 $b,$c,$t1
	add	$t3,$a,$a
___
   }
       
   125 
       
   # R1($i,$a,$b,$c,$d) - append the assembly for round $i of the second
   # group of rounds to $code.
   #
   # On entry $t1 is expected to hold c&~d and $t2 the sum X[..]+K[i],
   # both prepared by the preceding round.  The round computes
   # a = b + rotl(a + ((b & d) | (c & ~d)) + X[..] + K[i], rot).
   # $j is the message-word index needed by round i+1; the last round of
   # the group (i == 31) already uses the index formula of R2 and prepares
   # b^c instead of b&~c in $t1.
   sub R1 {
     my ($i,$a,$b,$c,$d) = @_;
     my @rotations = (5,9,14,20);
     my $rot = $rotations[$i%4];
     my $j;
     if ($i<31)	{ $j = (1+5*($i+1))%16; }
     else	{ $j = (5+3*($i+1))%16; }
     my $xi  = $X[$j/2];

     # Odd words live in the upper half of their 64-bit register and have
     # to be extracted into $tx first.
     if (($j&1) && ($xi=$tx)) {
       $code.=<<___;
	 srlx	@X[$j/2],32,$xi		! extract X[$j]
___
     }
     $code.=<<___;
	and	$b,$d,$t3		! round $i
	add	$t2,$a,$a
	or	$t3,$t1,$t1
	 sethi	%hi(@K[$i+1]),$t2
	add	$t1,$a,$a
	 or	$t2,%lo(@K[$i+1]),$t2
	sll	$a,$rot,$t3
	 add	$xi,$t2,$t2		! X[$j]+K[`$i+1`]
	srl	$a,32-$rot,$a
	add	$b,$t3,$t3
	 `$i<31?"andn":"xor"`	 $b,$c,$t1
	add	$t3,$a,$a
___
   }
       
   150 
       
   # R2($i,$a,$b,$c,$d) - append the assembly for round $i of the third
   # group of rounds to $code.
   #
   # On entry $t1 is expected to hold c^d and $t2 the sum X[..]+K[i],
   # both prepared by the preceding round.  The round computes
   # a = b + rotl(a + (b ^ c ^ d) + X[..] + K[i], rot).
   # $j is the message-word index needed by round i+1; the last round of
   # the group (i == 47) already uses the index formula of R3.
   sub R2 {
     my ($i,$a,$b,$c,$d) = @_;
     my @rotations = (4,11,16,23);
     my $rot = $rotations[$i%4];
     my $j;
     if ($i<47)	{ $j = (5+3*($i+1))%16; }
     else	{ $j = (0+7*($i+1))%16; }
     my $xi  = $X[$j/2];

     # Odd words live in the upper half of their 64-bit register and have
     # to be extracted into $tx first.
     if (($j&1) && ($xi=$tx)) {
       $code.=<<___;
	 srlx	@X[$j/2],32,$xi		! extract X[$j]
___
     }
     $code.=<<___;
	add	$t2,$a,$a		! round $i
	xor	$b,$t1,$t1
	 sethi	%hi(@K[$i+1]),$t2
	add	$t1,$a,$a
	 or	$t2,%lo(@K[$i+1]),$t2
	sll	$a,$rot,$t3
	 add	$xi,$t2,$t2		! X[$j]+K[`$i+1`]
	srl	$a,32-$rot,$a
	add	$b,$t3,$t3
	 xor	 $b,$c,$t1
	add	$t3,$a,$a
___
   }
       
   174 
       
   # R3($i,$a,$b,$c,$d) - append the assembly for round $i of the fourth
   # group of rounds to $code.
   #
   # On entry $t2 is expected to hold X[..]+K[i], prepared by the preceding
   # round; $t1 is recomputed here.  The round computes
   # a = b + rotl(a + (c ^ (b | ~d)) + X[..] + K[i], rot).
   # $j is the message-word index needed by round i+1.
   sub R3 {
     my ($i,$a,$b,$c,$d) = @_;
     my @rotations = (6,10,15,21);
     my $rot = $rotations[$i%4];
     my $j   = (0+7*($i+1))%16;
     my $xi  = $X[$j/2];

     $code.=<<___;
	add	$t2,$a,$a		! round $i
___
     # Odd words live in the upper half of their 64-bit register and have
     # to be extracted into $tx first.
     if (($j&1) && ($xi=$tx)) {
       $code.=<<___;
	 srlx	@X[$j/2],32,$xi		! extract X[$j]
___
     }
     $code.=<<___;
	orn	$b,$d,$t1
	 sethi	%hi(@K[$i+1]),$t2
	xor	$c,$t1,$t1
	 or	$t2,%lo(@K[$i+1]),$t2
	add	$t1,$a,$a
	sll	$a,$rot,$t3
	 add	$xi,$t2,$t2		! X[$j]+K[`$i+1`]
	srl	$a,32-$rot,$a
	add	$b,$t3,$t3
	add	$t3,$a,$a
___
   }
       
   200 
       
   # Assemble the text of md5_block_asm_data_order in $code.
   #
   # The 64-bit ABI needs the scratch use of %g2 and %g3 to be declared.
   if ($bits==64) {
     $code.=<<___;
.register	%g2,#scratch
.register	%g3,#scratch
___
   }

   # Function entry.  The CFR_MD5 bit of OPENSSL_sparcv9cap_P[1] selects
   # between the hardware path, which uses the MD5 opcode emitted as a raw
   # .word and has separate loops for 8-byte aligned and unaligned input,
   # and the software path at .Lsoftware.  The software path ends here with
   # the loading and alignment of the first input word and the preparation
   # of $t1 and $t2 for round 0.
   $code.=<<___;
#include "sparc_arch.h"

.section	".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	md5_block_asm_data_order
.align	32
md5_block_asm_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_MD5, %g0
	be	.Lsoftware
	nop

	mov	4, %g1
	andcc	%o1, 0x7, %g0
	lda	[%o0 + %g0]0x88, %f0		! load context
	lda	[%o0 + %g1]0x88, %f1
	add	%o0, 8, %o0
	lda	[%o0 + %g0]0x88, %f2
	lda	[%o0 + %g1]0x88, %f3
	bne,pn	%icc, .Lhwunaligned
	sub	%o0, 8, %o0

.Lhw_loop:
	ldd	[%o1 + 0x00], %f8
	ldd	[%o1 + 0x08], %f10
	ldd	[%o1 + 0x10], %f12
	ldd	[%o1 + 0x18], %f14
	ldd	[%o1 + 0x20], %f16
	ldd	[%o1 + 0x28], %f18
	ldd	[%o1 + 0x30], %f20
	subcc	%o2, 1, %o2		! done yet? 
	ldd	[%o1 + 0x38], %f22
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	.word	0x81b02800		! MD5

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhw_loop
	nop

.Lhwfinish:
	sta	%f0, [%o0 + %g0]0x88	! store context
	sta	%f1, [%o0 + %g1]0x88
	add	%o0, 8, %o0
	sta	%f2, [%o0 + %g0]0x88
	sta	%f3, [%o0 + %g1]0x88
	retl
	nop

.align	8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f12
	ldd	[%o1 + 0x10], %f14
	ldd	[%o1 + 0x18], %f16
	ldd	[%o1 + 0x20], %f18
	ldd	[%o1 + 0x28], %f20
	ldd	[%o1 + 0x30], %f22
	ldd	[%o1 + 0x38], %f24
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x40], %f26
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word	0x81b02800		! MD5

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
	nop

.align	16
.Lsoftware:
	save	%sp,-$frame,%sp

	rd	%asi,$saved_asi
	wr	%g0,0x88,%asi		! ASI_PRIMARY_LITTLE
	and	$inp,7,$shr
	andn	$inp,7,$inp

	sll	$shr,3,$shr		! *=8
	mov	56,$shl2
	ld	[$ctx+0],$A
	sub	$shl2,$shr,$shl2
	ld	[$ctx+4],$B
	and	$shl2,32,$shl1
	add	$shl2,8,$shl2
	ld	[$ctx+8],$C
	sub	$shl2,$shl1,$shl2	! shr+shl1+shl2==64
	ld	[$ctx+12],$D
	nop

.Loop:
	 cmp	$shr,0			! was inp aligned?
	ldxa	[$inp+0]%asi,@X[0]	! load little-endian input
	ldxa	[$inp+8]%asi,@X[1]
	ldxa	[$inp+16]%asi,@X[2]
	ldxa	[$inp+24]%asi,@X[3]
	ldxa	[$inp+32]%asi,@X[4]
	 sllx	$A,32,$AB		! pack A,B
	ldxa	[$inp+40]%asi,@X[5]
	 sllx	$C,32,$CD		! pack C,D
	ldxa	[$inp+48]%asi,@X[6]
	 or	$B,$AB,$AB
	ldxa	[$inp+56]%asi,@X[7]
	 or	$D,$CD,$CD
	bnz,a,pn	%icc,.+8
	ldxa	[$inp+64]%asi,@X[8]

	srlx	@X[0],$shr,@X[0]	! align X[0]
	sllx	@X[1],$shl1,$tx
	 sethi	%hi(@K[0]),$t2
	sllx	$tx,$shl2,$tx
	 or	$t2,%lo(@K[0]),$t2
	or	$tx,@X[0],@X[0]
	 xor	$C,$D,$t1
	 add	@X[0],$t2,$t2		! X[0]+K[0]
___

   # The 64 rounds: one generator per round, in round order.  After every
   # round the register list is rotated by one position, so the register
   # that was just updated plays the role of "b" in the following round.
   {
     my @round_emitters = ((\&R0) x 15, \&R0_1,
			   (\&R1) x 16,
			   (\&R2) x 16,
			   (\&R3) x 16);
     for ($i=0;$i<@round_emitters;$i++) {
       $round_emitters[$i]->($i,@V);
       unshift(@V,pop(@V));
     }
   }

   # Loop tail of the software path: add the state packed into $AB/$CD at
   # the top of the loop to the new state, iterate while blocks remain,
   # then store the context, restore %asi and return.
   $code.=<<___;
	srlx	$AB,32,$t1		! unpack A,B,C,D and accumulate
	add	$inp,64,$inp		! advance inp
	srlx	$CD,32,$t2
	add	$t1,$A,$A
	subcc	$len,1,$len		! done yet?
	add	$AB,$B,$B
	add	$t2,$C,$C
	add	$CD,$D,$D
	srl	$B,0,$B			! clruw	$B
	bne	`$bits==64?"%xcc":"%icc"`,.Loop
	srl	$D,0,$D			! clruw	$D

	st	$A,[$ctx+0]		! write out ctx
	st	$B,[$ctx+4]
	st	$C,[$ctx+8]
	st	$D,[$ctx+12]

	wr	%g0,$saved_asi,%asi
	ret
	restore
.type	md5_block_asm_data_order,#function
.size	md5_block_asm_data_order,(.-md5_block_asm_data_order)

.asciz	"MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
       
   375 
       
   # Purpose of these subroutines is to explicitly encode VIS instructions,
   # so that one can compile the module without having to specify VIS
   # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
   # Idea is to reserve for option to produce "universal" binary and let
   # programmer detect if current CPU is VIS capable at run-time.
   #
   # unvis($mnemonic,$rs1,$rs2,$rd) - encode a three-operand floating-point
   # VIS instruction as a raw ".word" directive.
   #
   #   $mnemonic	instruction name; only "faligndata" and "for" are
   #		encoded
   #   $rs1,$rs2,$rd	operands of the form %fN
   #
   # Returns ".word\t0x........ !<original instruction>" when the mnemonic
   # is known and all operands are encodable.  Otherwise the original
   # instruction text "<mnemonic>\t<rs1>,<rs2>,<rd>" is returned unchanged,
   # leaving it to the assembler.
   sub unvis {
   my ($mnemonic,$rs1,$rs2,$rd)=@_;
   my ($ref,$opf);		# both lexical; "my $ref,$opf" declares only $ref
   my %visopf = (	"faligndata"	=> 0x048,
		"for"		=> 0x07c	);

       $ref = "$mnemonic\t$rs1,$rs2,$rd";

       if ($opf=$visopf{$mnemonic}) {
	# $_ aliases each operand in turn, so the assignments below
	# replace the register names by their 5-bit field values.
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		# odd register numbers of 32 and above are not encoded
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
       } else {
	return $ref;
       }
   }
       
   # unalignaddr($mnemonic,$rs1,$rs2,$rd) - encode an "alignaddr"
   # instruction with integer register operands as a raw ".word" directive.
   #
   # Each operand must look like %gN, %oN, %lN or %iN with N in 0..7; the
   # register number is the bank base (g=0, o=8, l=16, i=24) plus N.
   # Returns ".word\t0x........ !<original instruction>", or the original
   # instruction text unchanged if any operand does not have that form.
   sub unalignaddr {
       my ($mnemonic,$rs1,$rs2,$rd)=@_;
       my $asm_text="$mnemonic\t$rs1,$rs2,$rd";
       my %bank_base = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
       my @regno;

       for my $operand ($rs1,$rs2,$rd) {
	   return $asm_text unless ($operand =~ /%([goli])([0-7])/);
	   push @regno, $bank_base{$1}+$2;
       }

       my ($n1,$n2,$nd) = @regno;
       return  sprintf ".word\t0x%08x !%s",
		       0x81b00300|$nd<<25|$n1<<14|$n2,
		       $asm_text;
   }
       
   420 
       
   # Post-process the generated text line by line and write it to STDOUT.
   foreach (split("\n",$code)) {
	# Replace every `...` by the value of the Perl expression inside.
	s/\`([^\`]*)\`/eval $1/ge;

	# Replace three-operand instructions on %f registers whose
	# mnemonic starts with "f", and alignaddr on integer registers,
	# by whatever unvis/unalignaddr return for them.
	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
   }

   # Output is buffered, so write errors may only show up when the handle
   # is closed; do not let them pass unnoticed.
   close STDOUT or die "error closing STDOUT: $!";