components/openssl/openssl-1.0.1/inline-t4/sparct4-mont.pl
#!/usr/bin/env perl

# ====================================================================
# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
# <appro@openssl.org>. The module is licensed under 2-clause BSD
# license. November 2012. All rights reserved.
# ====================================================================

######################################################################
# Montgomery squaring-n-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) a collection of "single-op" subroutines that perform a single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) a collection of "multi-op" subroutines that perform 5 squaring and
#    1 multiplication operations on operands of the above lengths;
# 3) fall-back and helper VIS3 subroutines.
#
# RSA sign is dominated by the multi-op subroutines, while RSA verify
# and DSA are dominated by the single-op ones. A special note about the
# 4096-bit RSA verify result: the operands are too long for the
# dedicated hardware, so they are handled by the VIS3 code, which is
# why you don't see any improvement there. It surely could be improved
# [by deploying the 'mpmul' instruction], maybe in the future...
#
# Performance improvement.
#
# 64-bit process, VIS3:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000628s 0.000028s   1592.4  35434.4
# rsa 2048 bits 0.003282s 0.000106s    304.7   9438.3
# rsa 4096 bits 0.025866s 0.000340s     38.7   2940.9
# dsa 1024 bits 0.000301s 0.000332s   3323.7   3013.9
# dsa 2048 bits 0.001056s 0.001233s    946.9    810.8
#
# 64-bit process, this module:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000256s 0.000016s   3904.4  61411.9
# rsa 2048 bits 0.000946s 0.000029s   1056.8  34292.7
# rsa 4096 bits 0.005061s 0.000340s    197.6   2940.5
# dsa 1024 bits 0.000176s 0.000195s   5674.7   5130.5
# dsa 2048 bits 0.000296s 0.000354s   3383.2   2827.6
#
######################################################################
# 32-bit process, VIS3:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000665s 0.000028s   1504.8  35233.3
# rsa 2048 bits 0.003349s 0.000106s    298.6   9433.4
# rsa 4096 bits 0.025959s 0.000341s     38.5   2934.8
# dsa 1024 bits 0.000320s 0.000341s   3123.3   2929.6
# dsa 2048 bits 0.001101s 0.001260s    908.2    793.4
#
# 32-bit process, this module:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000301s 0.000017s   3317.1  60240.0
# rsa 2048 bits 0.001034s 0.000030s    966.9  33812.7
# rsa 4096 bits 0.005244s 0.000341s    190.7   2935.4
# dsa 1024 bits 0.000201s 0.000205s   4976.1   4879.2
# dsa 2048 bits 0.000328s 0.000360s   3051.1   2774.2
#
# 32-bit code is prone to performance degradation as the interrupt rate
# dispatched to the CPU executing it grows. This is because in the
# standard procedure for handling an interrupt in a 32-bit process
# context, the upper halves of most integer registers used as input or
# output are zeroed. This renders the result invalid, and the operation
# has to be re-run. If the CPU is "bothered" with timer interrupts
# only, the penalty is hardly measurable. But to mitigate this problem
# at higher interrupt rates, the contemporary Linux kernel recognizes a
# biased stack even in a 32-bit process context and preserves full
# register contents. See
# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# for details.
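#
# This module guards against such clobbering at run time: a sentinel
# with an all-ones upper half is OR-ed into every frame pointer, and
# after each hardware-assisted operation the upper register halves are
# re-examined. If the sentinel was wiped, the subroutine returns 0 so
# that the caller can re-run the operation, e.g. through the VIS3
# fall-back code at the end of this file.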
       
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

$code.=<<___;
#include "sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef	__PIC__
SPARC_PIC_THUNK(%g1)
#endif
___

########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
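# Roughly, as the loads below suggest: @A stages the a[] limbs, @B the
# b[] limbs and @N the modulus n[], spread across the register windows
# created by the chains of save instructions; a[] limbs past the 14th
# travel in the even/odd %f pairs listed in @R, and n0 is parked in
# %f60 (an inference from the code below, not a quote from the manual).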
       

########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
#			  const u64 *np,const BN_ULONG *n0);
#
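# Each instance returns 1 on success, or 0 if the register contents
# were clobbered mid-operation (see the note on 32-bit interrupt
# handling above), in which case the caller is expected to re-run the
# operation through a fall-back path. A hypothetical C-level call for
# 1024-bit (16-limb) operands could look like this, with the VIS3
# bn_mul_mont_t4 below serving as the fall-back:
#
#	if (!bn_mul_mont_t4_16(rp, ap, bp, np, n0))
#		bn_mul_mont_t4(rp, ap, bp, np, n0, 16);
#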
       
sub generate_bn_mul_mont_t4() {
my $NUM=shift;
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_mul_mont_t4_$NUM
.align	32
bn_mul_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9cap_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$rp
	mov	%i1,$ap
	mov	%i2,$bp
	mov	%i3,$np
	ld	[%i4+0],%f1	! load *n0
	ld	[%i4+4],%f0
	fsrc2	%f0,%f60
___

# load ap[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],@A[$i]
	sllx	@A[$i],32,@A[$i]
	or	$lo,@A[$i],@A[$i]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],$hi
	fsrc2	$hi,@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	cmp	$ap,$bp
	be	SIZE_T_CC,.Lmsquare_$NUM
	nop
___

# load bp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
# magic ################################################################
$code.=<<___;
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
.Lmresume_$NUM:
	fbu,pn	%fcc3,.Lmabort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort_$NUM
#endif
	nop
#ifdef	__arch64__
	restore
	restore
	restore
	restore
	restore
#else
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Lmabort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	 and	%fp,$sentinel,$sentinel
	restore
	 and	$sentinel,1,%o7
	 and	%fp,$sentinel,$sentinel
	 srl	%fp,0,%fp		! just in case?
	 or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Lmdone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
$code.=<<___;
	st	$lo,[$rp+$i*8+0]
	st	@R[$i],[$rp+$i*8+4]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	fsrc2	@R[$i],$hi
	st	$lo,[$rp+$i*8+0]
	st	$hi,[$rp+$i*8+4]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Lmdone_$NUM:
	ret
	restore

.Lmabort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Lmabort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore

.align	32
.Lmsquare_$NUM:
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	ba	.Lmresume_$NUM
	nop
.type	bn_mul_mont_t4_$NUM, #function
.size	bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_mul_mont_t4($i);
}
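# i.e. bn_mul_mont_t4_8, _16, _24 and _32, the 512-, 1024-, 1536- and
# 2048-bit single-op instances advertised at the top of this file.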
       

########################################################################
#
sub load_ccr {
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
$code.=<<___;
	srl	$pwr,	2,	%o4
	and	$pwr,	3,	%o5
	and	%o4,	7,	%o4
	sll	%o5,	3,	%o5	! offset within first cache line
	add	%o5,	$ptbl,	$ptbl	! of the pwrtbl
	or	%g0,	1,	%o5
	sll	%o5,	%o4,	$ccr
___
$code.=<<___	if (!$skip_wr);
	wr	$ccr,	%g0,	%ccr
___
}
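# A worked example of the indexing in load_ccr: for $pwr == 13,
# %o5 = 13 & 3 = 1 selects byte offset 1*8 within the first 32-byte
# cache line of the power table, while %o4 = (13 >> 2) & 7 = 3 yields
# $ccr = 1 << 3, a one-hot mask. Once written to %ccr, that mask makes
# exactly one conditional move in load_b/load_b_pair below fire,
# selecting the entry at index 3 out of the eight candidates.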
       
sub load_b_pair {
my ($pwrtbl,$B0,$B1)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$B0
	ldx	[$pwrtbl+8*32],	$B1
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+9*32],	%o5
	movvs	%icc,	%o4,	$B0
	ldx	[$pwrtbl+2*32],	%o4
	movvs	%icc,	%o5,	$B1
	ldx	[$pwrtbl+10*32],%o5
	move	%icc,	%o4,	$B0
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc,	%o5,	$B1
	ldx	[$pwrtbl+11*32],%o5
	movneg	%icc,	%o4,	$B0
	ldx	[$pwrtbl+4*32],	%o4
	movneg	%icc,	%o5,	$B1
	ldx	[$pwrtbl+12*32],%o5
	movcs	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+5*32],	%o4
	movcs	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+13*32],%o5
	movvs	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+6*32],	%o4
	movvs	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+14*32],%o5
	move	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+15*32],%o5
	movneg	%xcc,	%o4,	$B0
	add	$pwrtbl,16*32,	$pwrtbl
	movneg	%xcc,	%o5,	$B1
___
}
sub load_b {
my ($pwrtbl,$Bi)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$Bi
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+2*32],	%o5
	movvs	%icc,	%o4,	$Bi
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc,	%o5,	$Bi
	ldx	[$pwrtbl+4*32],	%o5
	movneg	%icc,	%o4,	$Bi
	ldx	[$pwrtbl+5*32],	%o4
	movcs	%xcc,	%o5,	$Bi
	ldx	[$pwrtbl+6*32],	%o5
	movvs	%xcc,	%o4,	$Bi
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc,	%o5,	$Bi
	add	$pwrtbl,8*32,	$pwrtbl
	movneg	%xcc,	%o4,	$Bi
___
}
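# Note that load_b and load_b_pair read all eight (respectively
# sixteen) candidate table entries and select the wanted one with
# conditional moves keyed to the one-hot mask in %ccr, so the memory
# access pattern is independent of the secret power index, i.e. the
# gather is cache-timing-safe.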
       

########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
#			   const u64 *pwrtbl,int pwr,int stride);
#
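# Each .Lstride_$NUM iteration below performs five Montgomery squarings
# of tp[] followed by one Montgomery multiplication by an entry
# gathered from pwrtbl, i.e. one step of a fixed 5-bit-window modular
# exponentiation; pwr carries the exponent bits and stride the bit
# position of the current window, which steps down by 5 per iteration.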
       
sub generate_bn_pwr5_mont_t4() {
my $NUM=shift;
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_pwr5_mont_t4_$NUM
.align	32
bn_pwr5_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9cap_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$tp
	mov	%i1,$np
	ld	[%i2+0],%f1	! load *n0
	ld	[%i2+4],%f0
	mov	%i3,$pwrtbl
	srl	%i4,%g0,%i4	! pack last arguments
	sllx	%i5,32,$pwr
	or	%i4,$pwr,$pwr
	fsrc2	%f0,%f60
___

# load tp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldd	[$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp

	srlx	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%o7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	&load_ccr("%o7","%o5","%o4");
$code.=<<___;
	b	.Lstride_$NUM
	nop
.align	16
.Lstride_$NUM:
___
for($i=0; $i<14 && $i<$NUM; $i+=2) {
	&load_b_pair("%o7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
	&load_b_pair("%i7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	srax	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%i7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	&load_ccr("%i7","%o5","%o4",1);

# magic ################################################################
for($i=0; $i<5; $i++) {
$code.=<<___;
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif
	nop
___
}
$code.=<<___;
	wr	%o4,	%g0,	%ccr
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif

	srax	$pwr,	32,	%o4
#ifdef	__arch64__
	brgez	%o4,.Lstride_$NUM
	restore
	restore
	restore
	restore
	restore
#else
	brgez	%o4,.Lstride_$NUM
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Labort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	 and	%fp,$sentinel,$sentinel
	restore
	 and	$sentinel,1,%o7
	 and	%fp,$sentinel,$sentinel
	 srl	%fp,0,%fp		! just in case?
	 or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Ldone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
	std	@R[$i],[$tp+$i*8]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Ldone_$NUM:
	ret
	restore

.Labort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Labort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore
.type	bn_pwr5_mont_t4_$NUM, #function
.size	bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_pwr5_mont_t4($i);
}
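# i.e. bn_pwr5_mont_t4_8, _16, _24 and _32, covering the same 512- to
# 2048-bit operand sizes as the single-op instances above.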
       

{
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
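# (The stores to the temporary buffer below use "stxa ...[$tp]0xe2";
# ASI 0xe2 is, presumably, the Niagara block-initializing store ASI,
# which spares the read-to-own of the freshly allocated scratch lines.)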
       
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# int bn_mul_mont(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *bp,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is >=3
$code.=<<___;
.globl	bn_mul_mont_t4
.align	32
bn_mul_mont_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 byte
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1

	save	%sp,	%g1,	%sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	ldx	[$bp+0],	$m0	! m0=bp[0]
	sllx	$t1,	32,	$n0
	add	$bp,	8,	$bp
	or	$t0,	$n0,	$n0

	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st
	sub	$cnt,	8,	$cnt	! j--
!.L1st
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp

	ba	.Louter
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter:
	ldx	[$bp+0],	$m0	! m0=bp[i]
	add	$bp,	8,	$bp

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],		$tj	! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner
	sub	$cnt,	8,	$cnt
!.Linner
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter
	sub	$i,	8,	$i

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub:
	ldx	[$tp],		$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	and	$tp,	$ovf,	$ap
	andn	$rp,	$ovf,	$np
	or	$np,	$ap,	$ap	! ap=borrow?tp:rp
	ba	.Lcopy
	sub	$num,	8,	$cnt

.align	16
.Lcopy:					! copy or in-place refresh
	ldx	[$ap+0],	$t2
	add	$ap,	8,	$ap
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_t4, #function
.size	bn_mul_mont_t4, .-bn_mul_mont_t4
___

# int bn_mul_mont_gather5(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *pwrtbl,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num,	# caller ensures that num is >=3
		# int power);
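#
# bn_mul_mont_gather5_t4 is bn_mul_mont_t4 with the b[] operand
# gathered from the power table on the fly: $bp points at pwrtbl, and
# each outer-loop word bp[i] is fetched with the constant-time load_b
# gather keyed off the 7th argument, power.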
       
$code.=<<___;
.globl	bn_mul_mont_gather5_t4
.align	32
bn_mul_mont_gather5_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 byte
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1
	LDPTR	[%sp+STACK_7thARG],	%g4	! load power, 7th argument

	save	%sp,	%g1,	%sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
	&load_ccr($bp,"%g4",$ccr);
	&load_b($bp,$m0,"%o7");		# m0=bp[0]

$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	sllx	$t1,	32,	$n0
	or	$t0,	$n0,	$n0

	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st_g5
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st_g5:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st_g5
	sub	$cnt,	8,	$cnt	! j--
!.L1st_g5
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp

	ba	.Louter_g5
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter_g5:
	wr	$ccr,	%g0,	%ccr
___
	&load_b($bp,$m0);		# m0=bp[i]
$code.=<<___;
	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],		$tj	! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner_g5
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner_g5:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner_g5
	sub	$cnt,	8,	$cnt
!.Linner_g5
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter_g5
	sub	$i,	8,	$i

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub_g5
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub_g5:
	ldx	[$tp],		$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub_g5
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	and	$tp,	$ovf,	$ap
	andn	$rp,	$ovf,	$np
	or	$np,	$ap,	$ap	! ap=borrow?tp:rp
	ba	.Lcopy_g5
	sub	$num,	8,	$cnt

.align	16
.Lcopy_g5:				! copy or in-place refresh
	ldx	[$ap+0],	$t2
	add	$ap,	8,	$ap
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy_g5
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_gather5_t4, #function
.size	bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
___
}

$code.=<<___;
.globl	bn_flip_t4
.align	32
bn_flip_t4:
.Loop_flip:
	ld	[%o1+0],	%o4
	sub	%o2,	1,	%o2
	ld	[%o1+4],	%o5
	add	%o1,	8,	%o1
	st	%o5,	[%o0+0]
	st	%o4,	[%o0+4]
	brnz	%o2,	.Loop_flip
	add	%o0,	8,	%o0
	retl
	nop
.type	bn_flip_t4, #function
.size	bn_flip_t4, .-bn_flip_t4

.globl	bn_flip_n_scatter5_t4
.align	32
bn_flip_n_scatter5_t4:
	sll	%o3,	3,	%o3
	srl	%o1,	1,	%o1
	add	%o3,	%o2,	%o2	! &pwrtbl[pwr]
	sub	%o1,	1,	%o1
.Loop_flip_n_scatter5:
	ld	[%o0+0],	%o4	! inp[i]
	ld	[%o0+4],	%o5
	add	%o0,	8,	%o0
	sllx	%o5,	32,	%o5
	or	%o4,	%o5,	%o5
	stx	%o5,	[%o2]
	add	%o2,	32*8,	%o2
	brnz	%o1,	.Loop_flip_n_scatter5
	sub	%o1,	1,	%o1
	retl
	nop
.type	bn_flip_n_scatter5_t4, #function
.size	bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4

.globl	bn_gather5_t4
.align	32
bn_gather5_t4:
___
	&load_ccr("%o2","%o3","%g1");
$code.=<<___;
	sub	%o1,	1,	%o1
.Loop_gather5:
___
	&load_b("%o2","%g1");
$code.=<<___;
	stx	%g1,	[%o0]
	add	%o0,	8,	%o0
	brnz	%o1,	.Loop_gather5
	sub	%o1,	1,	%o1

	retl
	nop
.type	bn_gather5_t4, #function
.size	bn_gather5_t4, .-bn_gather5_t4

.asciz	"Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
.align	4
___

&emit_assembler();

close STDOUT;