|
1 #!/usr/bin/env perl |
|
2 |
|
3 # ==================================================================== |
|
4 # Written by David S. Miller <[email protected]> and Andy Polyakov |
|
5 # <[email protected]>. The module is licensed under 2-clause BSD |
|
6 # license. November 2012. All rights reserved. |
|
7 # ==================================================================== |
|
8 |
|
9 ###################################################################### |
|
10 # Montgomery squaring-n-multiplication module for SPARC T4. |
|
11 # |
|
12 # The module consists of three parts: |
|
13 # |
|
14 # 1) collection of "single-op" subroutines that perform single |
|
15 # operation, Montgomery squaring or multiplication, on 512-, |
|
16 # 1024-, 1536- and 2048-bit operands; |
|
17 # 2) collection of "multi-op" subroutines that perform 5 squaring and |
|
18 # 1 multiplication operations on operands of above lengths; |
|
19 # 3) fall-back and helper VIS3 subroutines. |
|
20 # |
|
21 # RSA sign is dominated by multi-op subroutine, while RSA verify and |
|
22 # DSA - by single-op. Special note about 4096-bit RSA verify result. |
|
23 # Operands are too long for dedicated hardware and it's handled by |
|
24 # VIS3 code, which is why you don't see any improvement. It's surely |
|
25 # possible to improve it [by deploying 'mpmul' instruction], maybe in |
|
26 # the future... |
|
27 # |
|
28 # Performance improvement. |
|
29 # |
|
30 # 64-bit process, VIS3: |
|
31 # sign verify sign/s verify/s |
|
32 # rsa 1024 bits 0.000628s 0.000028s 1592.4 35434.4 |
|
33 # rsa 2048 bits 0.003282s 0.000106s 304.7 9438.3 |
|
34 # rsa 4096 bits 0.025866s 0.000340s 38.7 2940.9 |
|
35 # dsa 1024 bits 0.000301s 0.000332s 3323.7 3013.9 |
|
36 # dsa 2048 bits 0.001056s 0.001233s 946.9 810.8 |
|
37 # |
|
38 # 64-bit process, this module: |
|
39 # sign verify sign/s verify/s |
|
40 # rsa 1024 bits 0.000256s 0.000016s 3904.4 61411.9 |
|
41 # rsa 2048 bits 0.000946s 0.000029s 1056.8 34292.7 |
|
42 # rsa 4096 bits 0.005061s 0.000340s 197.6 2940.5 |
|
43 # dsa 1024 bits 0.000176s 0.000195s 5674.7 5130.5 |
|
44 # dsa 2048 bits 0.000296s 0.000354s 3383.2 2827.6 |
|
45 # |
|
46 ###################################################################### |
|
47 # 32-bit process, VIS3: |
|
48 # sign verify sign/s verify/s |
|
49 # rsa 1024 bits 0.000665s 0.000028s 1504.8 35233.3 |
|
50 # rsa 2048 bits 0.003349s 0.000106s 298.6 9433.4 |
|
51 # rsa 4096 bits 0.025959s 0.000341s 38.5 2934.8 |
|
52 # dsa 1024 bits 0.000320s 0.000341s 3123.3 2929.6 |
|
53 # dsa 2048 bits 0.001101s 0.001260s 908.2 793.4 |
|
54 # |
|
55 # 32-bit process, this module: |
|
56 # sign verify sign/s verify/s |
|
57 # rsa 1024 bits 0.000301s 0.000017s 3317.1 60240.0 |
|
58 # rsa 2048 bits 0.001034s 0.000030s 966.9 33812.7 |
|
59 # rsa 4096 bits 0.005244s 0.000341s 190.7 2935.4 |
|
60 # dsa 1024 bits 0.000201s 0.000205s 4976.1 4879.2 |
|
61 # dsa 2048 bits 0.000328s 0.000360s 3051.1 2774.2 |
|
62 # |
|
63 # 32-bit code is prone to performance degradation as interrupt rate |
|
64 # dispatched to CPU executing the code grows. This is because in |
|
65 # standard process of handling interrupt in 32-bit process context |
|
66 # upper halves of most integer registers used as input or output are |
|
67 # zeroed. This renders result invalid, and operation has to be re-run. |
|
68 # If CPU is "bothered" with timer interrupts only, the penalty is |
|
69 # hardly measurable. But in order to mitigate this problem for higher |
|
70 # interrupt rates contemporary Linux kernel recognizes biased stack |
|
71 # even in 32-bit process context and preserves full register contents. |
|
72 # See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb |
|
73 # for details. |
|
74 |
|
# Locate this script's directory and make the shared perlasm helpers
# (sparcv9_modes.pl) reachable through @INC.
$0 =~ m{^(.*[/\\])[^/\\]+$};
$dir = $1;
push @INC, $dir, "${dir}../../perlasm";
require "sparcv9_modes.pl";
|
78 |
|
# Emit the common assembler prologue: CPU-capability header, scratch
# declarations for %g2/%g3 under the 64-bit ABI, the text section, and
# (for PIC builds) the thunk used to locate OPENSSL_sparcv9cap_P.
$code.=<<___;
#include "sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef	__PIC__
SPARC_PIC_THUNK(%g1)
#endif
___
|
93 |
|
########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
# NOTE(review): @N and @B deliberately repeat register names; the loader
# code below issues a `save` between load groups, so repeated names
# presumably address fresh registers in the next window -- the aliasing
# is intentional, not a bug.
my @R = map { "%f" . 2 * $_ } (0 .. 11, 30, 31, 12 .. 29);
my @N = (map("%l$_", (0 .. 7)), map("%o$_", (0 .. 5)));
@N = (@N, @N, @N[0 .. 3]);
my @A = (@N[0 .. 13], @R[14 .. 31]);
my @B = (map("%i$_", (0 .. 5)), map("%l$_", (0 .. 7)));
@B = (@B, @B, map("%o$_", (0 .. 3)));
|
103 |
|
104 ######################################################################## |
|
105 # int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp, |
|
106 # const u64 *np,const BN_ULONG *n0); |
|
107 # |
|
# Emit bn_mul_mont_t4_$NUM, a "single-op" subroutine performing one
# Montgomery multiplication -- or squaring, when ap and bp point at the
# same vector -- on $NUM 64-bit limbs with the T4 montmul/montsqr
# instructions (encoded as .word, see "magic" below).
#
#	int bn_mul_mont_t4_$NUM(u64 *rp, const u64 *ap, const u64 *bp,
#				const u64 *np, const BN_ULONG *n0);
#
# The emitted routine returns 1 on success and 0 on failure.  Failure
# means the result is invalid and the caller has to re-run the operation
# with the fall-back code: either the instruction flagged a problem via
# %fcc3, or (32-bit process only) an interrupt zeroed the upper halves
# of the input registers -- detected through the $sentinel/%fp trick
# described in the header comment at the top of this file.
sub generate_bn_mul_mont_t4() {
my $NUM=shift;
# Arguments are parked in globals %g1-%g5: everything else is rotated
# away by the chain of `save` instructions that spreads the operands
# across consecutive register windows.
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_mul_mont_t4_$NUM
.align	32
bn_mul_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$rp
	mov	%i1,$ap
	mov	%i2,$bp
	mov	%i3,$np
	ld	[%i4+0],%f1	! load *n0
	ld	[%i4+4],%f0
	fsrc2	%f0,%f60
___

# load ap[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
# First (up to) 14 limbs go to the integer registers of @A.  Loads are
# done as 32-bit halves and reassembled with sllx/or, so the sequence
# works regardless of stack model.
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],@A[$i]
	sllx	@A[$i],32,@A[$i]
	or	$lo,@A[$i],@A[$i]
___
}
# Remaining limbs (when $NUM>14) live in the floating-point registers of
# @A, assembled from 32-bit halves via fsrc2.
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],$hi
	fsrc2	$hi,@A[$i]
___
}
# load np[$NUM] ########################################################
# np is spread over several register windows; a `save` separates each
# group of up to 14 limbs, which is why @N repeats its register names.
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
# When ap and bp alias, skip loading bp and take the squaring path.
$code.=<<___;
	cmp	$ap,$bp
	be	SIZE_T_CC,.Lmsquare_$NUM
	nop
___

# load bp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
# magic ################################################################
# The montmul instruction itself, followed by the failure checks:
# %fcc3 reports a hardware condition; in 32-bit mode the sentinel is
# and-ed with %fp after every restore to detect clobbered upper halves.
$code.=<<___;
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
.Lmresume_$NUM:
	fbu,pn	%fcc3,.Lmabort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort_$NUM
#endif
	nop
#ifdef	__arch64__
	restore
	restore
	restore
	restore
	restore
#else
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
# Move the integer-register limbs of the result into FP registers so
# they survive the final restore(s) before being stored to rp.
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	and	%fp,$sentinel,$sentinel
	restore
	and	$sentinel,1,%o7
	and	%fp,$sentinel,$sentinel
	srl	%fp,0,%fp	! just in case?
	or	%o7,$sentinel,$sentinel
	brz,a,pn	$sentinel,.Lmdone_$NUM
	mov	0,%i0		! return failure
#endif
___
# Store the result 32 bits at a time ("reverse order" within each limb,
# since the halves were swapped on load).
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
$code.=<<___;
	st	$lo,[$rp+$i*8+0]
	st	@R[$i],[$rp+$i*8+4]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	fsrc2	@R[$i],$hi
	st	$lo,[$rp+$i*8+0]
	st	$hi,[$rp+$i*8+4]
___
}
$code.=<<___;
	mov	1,%i0	! return success
.Lmdone_$NUM:
	ret
	restore

.Lmabort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Lmabort1_$NUM:
	restore

	mov	0,%i0	! return failure
	ret
	restore

.align	32
.Lmsquare_$NUM:
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	ba	.Lmresume_$NUM
	nop
.type	bn_mul_mont_t4_$NUM, #function
.size	bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}
|
342 |
|
# Emit the dedicated single-op subroutines for 512-, 1024-, 1536- and
# 2048-bit operands (8, 16, 24 and 32 64-bit limbs).  The & call form is
# required: the generator carries an empty () prototype.
for my $limbs (8, 16, 24, 32) {
	&generate_bn_mul_mont_t4($limbs);
}
|
346 |
|
347 ######################################################################## |
|
348 # |
|
# Emit code computing the %ccr selector mask for the constant-time table
# gather: the low two bits of the power pick the 64-bit lane within a
# cache line (folded into the table pointer), the next three pick which
# of the eight conditional-move conditions fires.  Pass a true fourth
# argument to skip the final `wr` into %ccr.
sub load_ccr {
my ($tbl,$idx,$mask,$no_wr)=@_;

$code.=<<___;
	srl	$idx,	2,	%o4
	and	$idx,	3,	%o5
	and	%o4,	7,	%o4
	sll	%o5,	3,	%o5	! offset within first cache line
	add	%o5,	$tbl,	$tbl	! of the pwrtbl
	or	%g0,	1,	%o5
	sll	%o5,	%o4,	$mask
___
unless ($no_wr) {
	$code.=<<___;
	wr	$mask,	%g0,	%ccr
___
}
}
|
# Emit code gathering a pair of 64-bit words from the power table in
# constant time: all eight candidate cache lines are read for each word
# and the right one is selected with conditional moves keyed off the
# %ccr mask prepared by load_ccr.  Advances the table pointer past the
# sixteen lines consumed.
sub load_b_pair {
my ($tbl,$d0,$d1)=@_;

$code.=<<___;
	ldx	[$tbl+0*32],	$d0
	ldx	[$tbl+8*32],	$d1
	ldx	[$tbl+1*32],	%o4
	ldx	[$tbl+9*32],	%o5
	movvs	%icc,	%o4,	$d0
	ldx	[$tbl+2*32],	%o4
	movvs	%icc,	%o5,	$d1
	ldx	[$tbl+10*32],	%o5
	move	%icc,	%o4,	$d0
	ldx	[$tbl+3*32],	%o4
	move	%icc,	%o5,	$d1
	ldx	[$tbl+11*32],	%o5
	movneg	%icc,	%o4,	$d0
	ldx	[$tbl+4*32],	%o4
	movneg	%icc,	%o5,	$d1
	ldx	[$tbl+12*32],	%o5
	movcs	%xcc,	%o4,	$d0
	ldx	[$tbl+5*32],	%o4
	movcs	%xcc,	%o5,	$d1
	ldx	[$tbl+13*32],	%o5
	movvs	%xcc,	%o4,	$d0
	ldx	[$tbl+6*32],	%o4
	movvs	%xcc,	%o5,	$d1
	ldx	[$tbl+14*32],	%o5
	move	%xcc,	%o4,	$d0
	ldx	[$tbl+7*32],	%o4
	move	%xcc,	%o5,	$d1
	ldx	[$tbl+15*32],	%o5
	movneg	%xcc,	%o4,	$d0
	add	$tbl,	16*32,	$tbl
	movneg	%xcc,	%o5,	$d1
___
}
|
# Emit code gathering a single 64-bit word from the power table in
# constant time; same conditional-move selection scheme as load_b_pair,
# reading eight candidate cache lines and advancing the table pointer
# past them.
sub load_b {
my ($tbl,$dst)=@_;

$code.=<<___;
	ldx	[$tbl+0*32],	$dst
	ldx	[$tbl+1*32],	%o4
	ldx	[$tbl+2*32],	%o5
	movvs	%icc,	%o4,	$dst
	ldx	[$tbl+3*32],	%o4
	move	%icc,	%o5,	$dst
	ldx	[$tbl+4*32],	%o5
	movneg	%icc,	%o4,	$dst
	ldx	[$tbl+5*32],	%o4
	movcs	%xcc,	%o5,	$dst
	ldx	[$tbl+6*32],	%o5
	movvs	%xcc,	%o4,	$dst
	ldx	[$tbl+7*32],	%o4
	move	%xcc,	%o5,	$dst
	add	$tbl,	8*32,	$tbl
	movneg	%xcc,	%o4,	$dst
___
}
|
423 |
|
424 ######################################################################## |
|
425 # int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0, |
|
426 # const u64 *pwrtbl,int pwr,int stride); |
|
427 # |
|
# Emit bn_pwr5_mont_t4_$NUM, a "multi-op" subroutine: per loop iteration
# it gathers b from the power table in constant time, performs 5
# Montgomery squarings followed by 1 Montgomery multiplication on $NUM
# 64-bit limbs, repeating while the counter packed in the upper half of
# $pwr stays non-negative.
#
#	int bn_pwr5_mont_t4_$NUM(u64 *tp, const u64 *np, const BN_ULONG *n0,
#				 const u64 *pwrtbl, int pwr, int stride);
#
# Returns 1 on success, 0 when the result is invalid and the caller must
# re-run with the fall-back code -- same failure conditions and
# $sentinel/%fp detection trick as in bn_mul_mont_t4_$NUM.
sub generate_bn_pwr5_mont_t4() {
my $NUM=shift;
# Arguments live in globals %g1-%g5; everything else is rotated away by
# the `save` chain that spreads operands across register windows.
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_pwr5_mont_t4_$NUM
.align	32
bn_pwr5_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$tp
	mov	%i1,$np
	ld	[%i2+0],%f1	! load *n0
	ld	[%i2+4],%f0
	mov	%i3,$pwrtbl
	srl	%i4,%g0,%i4	! pack last arguments
	sllx	%i5,32,$pwr
	or	%i4,$pwr,$pwr
	fsrc2	%f0,%f60
___

# load tp[$NUM] ########################################################
# Unlike the single-op path, tp/np are loaded with 64-bit ldx/ldd:
# first (up to) 14 limbs into the integer registers of @A, the rest
# into its FP registers.
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldd	[$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
# np spans several register windows; a `save` separates each group,
# which is why @N repeats its register names.
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] ########################################################
# The upper half of $pwr holds the current bit offset into the exponent,
# decremented by 5 per pass; the lower half holds the exponent bits.
# %o5 ends up with the 5-bit window used to drive the table gather.
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp

	srlx	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%o7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
&load_ccr("%o7","%o5","%o4");
$code.=<<___;
	b	.Lstride_$NUM
	nop
.align	16
.Lstride_$NUM:
___
# Gather b for this window; pairs beyond the 14th come out of the next
# register window, hence the pointer switch from %o7 to %i7.
for($i=0; $i<14 && $i<$NUM; $i+=2) {
&load_b_pair("%o7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
&load_b_pair("%i7",@B[$i],@B[$i+1]);
}
# Prepare the gather mask for the NEXT iteration (stored in %o4, written
# to %ccr only after the five squarings below).
$code.=<<___;
	srax	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%i7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
&load_ccr("%i7","%o5","%o4",1);

# magic ################################################################
# Five squarings; each is followed by the %fcc3 check and (32-bit only)
# the sentinel check for clobbered register halves.
for($i=0; $i<5; $i++) {
$code.=<<___;
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif
	nop
___
}
# One multiplication by the gathered b, then loop while the packed
# counter (upper half of $pwr) is still non-negative.
$code.=<<___;
	wr	%o4,	%g0,	%ccr
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif

	srax	$pwr,	32,	%o4
#ifdef	__arch64__
	brgez	%o4,.Lstride_$NUM
	restore
	restore
	restore
	restore
	restore
#else
	brgez	%o4,.Lstride_$NUM
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
# Move integer-register limbs into FP registers so they survive the
# final restore(s) before being stored back to tp.
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	and	%fp,$sentinel,$sentinel
	restore
	and	$sentinel,1,%o7
	and	%fp,$sentinel,$sentinel
	srl	%fp,0,%fp	! just in case?
	or	%o7,$sentinel,$sentinel
	brz,a,pn	$sentinel,.Ldone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
	std	@R[$i],[$tp+$i*8]
___
}
$code.=<<___;
	mov	1,%i0	! return success
.Ldone_$NUM:
	ret
	restore

.Labort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Labort1_$NUM:
	restore

	mov	0,%i0	! return failure
	ret
	restore
.type	bn_pwr5_mont_t4_$NUM, #function
.size	bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}
|
648 |
|
# Emit the multi-op (5 x montsqr + 1 x montmul) subroutines for 8-, 16-,
# 24- and 32-limb operands.  The & call form is required: the generator
# carries an empty () prototype.
for my $limbs (8, 16, 24, 32) {
	&generate_bn_pwr5_mont_t4($limbs);
}
|
652 |
|
653 { |
|
654 ######################################################################## |
|
655 # Fall-back subroutines |
|
656 # |
|
657 # copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values |
|
658 # |
|
659 ($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)= |
|
660 (map("%g$_",(1..5)),map("%o$_",(0..5,7))); |
|
661 |
|
662 # int bn_mul_mont( |
|
663 $rp="%o0"; # u64 *rp, |
|
664 $ap="%o1"; # const u64 *ap, |
|
665 $bp="%o2"; # const u64 *bp, |
|
666 $np="%o3"; # const u64 *np, |
|
667 $n0p="%o4"; # const BN_ULONG *n0, |
|
668 $num="%o5"; # int num); # caller ensures that num is >=3 |
|
669 $code.=<<___; |
|
670 .globl bn_mul_mont_t4 |
|
671 .align 32 |
|
672 bn_mul_mont_t4: |
|
673 add %sp, STACK_BIAS, %g4 ! real top of stack |
|
674 sll $num, 3, $num ! size in bytes |
|
675 add $num, 63, %g1 |
|
676 andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes |
|
677 sub %g4, %g1, %g1 |
|
678 andn %g1, 63, %g1 ! align at 64 byte |
|
679 sub %g1, STACK_FRAME, %g1 ! new top of stack |
|
680 sub %g1, %g4, %g1 |
|
681 |
|
682 save %sp, %g1, %sp |
|
683 ___ |
|
684 # +-------------------------------+<----- %sp |
|
685 # . . |
|
686 # +-------------------------------+<----- aligned at 64 bytes |
|
687 # | __int64 tmp[0] | |
|
688 # +-------------------------------+ |
|
689 # . . |
|
690 # . . |
|
691 # +-------------------------------+<----- aligned at 64 bytes |
|
692 # . . |
|
693 ($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5)); |
|
694 ($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7)); |
|
695 ($ovf,$i)=($t0,$t1); |
|
696 $code.=<<___; |
|
697 ld [$n0p+0], $t0 ! pull n0[0..1] value |
|
698 ld [$n0p+4], $t1 |
|
699 add %sp, STACK_BIAS+STACK_FRAME, $tp |
|
700 ldx [$bp+0], $m0 ! m0=bp[0] |
|
701 sllx $t1, 32, $n0 |
|
702 add $bp, 8, $bp |
|
703 or $t0, $n0, $n0 |
|
704 |
|
705 ldx [$ap+0], $aj ! ap[0] |
|
706 |
|
707 mulx $aj, $m0, $lo0 ! ap[0]*bp[0] |
|
708 umulxhi $aj, $m0, $hi0 |
|
709 |
|
710 ldx [$ap+8], $aj ! ap[1] |
|
711 add $ap, 16, $ap |
|
712 ldx [$np+0], $nj ! np[0] |
|
713 |
|
714 mulx $lo0, $n0, $m1 ! "tp[0]"*n0 |
|
715 |
|
716 mulx $aj, $m0, $alo ! ap[1]*bp[0] |
|
717 umulxhi $aj, $m0, $aj ! ahi=aj |
|
718 |
|
719 mulx $nj, $m1, $lo1 ! np[0]*m1 |
|
720 umulxhi $nj, $m1, $hi1 |
|
721 |
|
722 ldx [$np+8], $nj ! np[1] |
|
723 |
|
724 addcc $lo0, $lo1, $lo1 |
|
725 add $np, 16, $np |
|
726 addxc %g0, $hi1, $hi1 |
|
727 |
|
728 mulx $nj, $m1, $nlo ! np[1]*m1 |
|
729 umulxhi $nj, $m1, $nj ! nhi=nj |
|
730 |
|
731 ba .L1st |
|
732 sub $num, 24, $cnt ! cnt=num-3 |
|
733 |
|
734 .align 16 |
|
735 .L1st: |
|
736 addcc $alo, $hi0, $lo0 |
|
737 addxc $aj, %g0, $hi0 |
|
738 |
|
739 ldx [$ap+0], $aj ! ap[j] |
|
740 addcc $nlo, $hi1, $lo1 |
|
741 add $ap, 8, $ap |
|
742 addxc $nj, %g0, $hi1 ! nhi=nj |
|
743 |
|
744 ldx [$np+0], $nj ! np[j] |
|
745 mulx $aj, $m0, $alo ! ap[j]*bp[0] |
|
746 add $np, 8, $np |
|
747 umulxhi $aj, $m0, $aj ! ahi=aj |
|
748 |
|
749 mulx $nj, $m1, $nlo ! np[j]*m1 |
|
750 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] |
|
751 umulxhi $nj, $m1, $nj ! nhi=nj |
|
752 addxc %g0, $hi1, $hi1 |
|
753 stxa $lo1, [$tp]0xe2 ! tp[j-1] |
|
754 add $tp, 8, $tp ! tp++ |
|
755 |
|
756 brnz,pt $cnt, .L1st |
|
757 sub $cnt, 8, $cnt ! j-- |
|
758 !.L1st |
|
759 addcc $alo, $hi0, $lo0 |
|
760 addxc $aj, %g0, $hi0 ! ahi=aj |
|
761 |
|
762 addcc $nlo, $hi1, $lo1 |
|
763 addxc $nj, %g0, $hi1 |
|
764 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] |
|
765 addxc %g0, $hi1, $hi1 |
|
766 stxa $lo1, [$tp]0xe2 ! tp[j-1] |
|
767 add $tp, 8, $tp |
|
768 |
|
769 addcc $hi0, $hi1, $hi1 |
|
770 addxc %g0, %g0, $ovf ! upmost overflow bit |
|
771 stxa $hi1, [$tp]0xe2 |
|
772 add $tp, 8, $tp |
|
773 |
|
774 ba .Louter |
|
775 sub $num, 16, $i ! i=num-2 |
|
776 |
|
777 .align 16 |
|
778 .Louter: |
|
779 ldx [$bp+0], $m0 ! m0=bp[i] |
|
780 add $bp, 8, $bp |
|
781 |
|
782 sub $ap, $num, $ap ! rewind |
|
783 sub $np, $num, $np |
|
784 sub $tp, $num, $tp |
|
785 |
|
786 ldx [$ap+0], $aj ! ap[0] |
|
787 ldx [$np+0], $nj ! np[0] |
|
788 |
|
789 mulx $aj, $m0, $lo0 ! ap[0]*bp[i] |
|
790 ldx [$tp], $tj ! tp[0] |
|
791 umulxhi $aj, $m0, $hi0 |
|
792 ldx [$ap+8], $aj ! ap[1] |
|
793 addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0] |
|
794 mulx $aj, $m0, $alo ! ap[1]*bp[i] |
|
795 addxc %g0, $hi0, $hi0 |
|
796 mulx $lo0, $n0, $m1 ! tp[0]*n0 |
|
797 umulxhi $aj, $m0, $aj ! ahi=aj |
|
798 mulx $nj, $m1, $lo1 ! np[0]*m1 |
|
799 add $ap, 16, $ap |
|
800 umulxhi $nj, $m1, $hi1 |
|
801 ldx [$np+8], $nj ! np[1] |
|
802 add $np, 16, $np |
|
803 addcc $lo1, $lo0, $lo1 |
|
804 mulx $nj, $m1, $nlo ! np[1]*m1 |
|
805 addxc %g0, $hi1, $hi1 |
|
806 umulxhi $nj, $m1, $nj ! nhi=nj |
|
807 |
|
808 ba .Linner |
|
809 sub $num, 24, $cnt ! cnt=num-3 |
|
810 .align 16 |
|
811 .Linner: |
|
812 addcc $alo, $hi0, $lo0 |
|
813 ldx [$tp+8], $tj ! tp[j] |
|
814 addxc $aj, %g0, $hi0 ! ahi=aj |
|
815 ldx [$ap+0], $aj ! ap[j] |
|
816 add $ap, 8, $ap |
|
817 addcc $nlo, $hi1, $lo1 |
|
818 mulx $aj, $m0, $alo ! ap[j]*bp[i] |
|
819 addxc $nj, %g0, $hi1 ! nhi=nj |
|
820 ldx [$np+0], $nj ! np[j] |
|
821 add $np, 8, $np |
|
822 umulxhi $aj, $m0, $aj ! ahi=aj |
|
823 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] |
|
824 mulx $nj, $m1, $nlo ! np[j]*m1 |
|
825 addxc %g0, $hi0, $hi0 |
|
826 umulxhi $nj, $m1, $nj ! nhi=nj |
|
827 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] |
|
828 addxc %g0, $hi1, $hi1 |
|
829 stx $lo1, [$tp] ! tp[j-1] |
|
830 add $tp, 8, $tp |
|
831 brnz,pt $cnt, .Linner |
|
832 sub $cnt, 8, $cnt |
|
833 !.Linner |
|
834 ldx [$tp+8], $tj ! tp[j] |
|
835 addcc $alo, $hi0, $lo0 |
|
836 addxc $aj, %g0, $hi0 ! ahi=aj |
|
837 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] |
|
838 addxc %g0, $hi0, $hi0 |
|
839 |
|
840 addcc $nlo, $hi1, $lo1 |
|
841 addxc $nj, %g0, $hi1 ! nhi=nj |
|
842 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] |
|
843 addxc %g0, $hi1, $hi1 |
|
844 stx $lo1, [$tp] ! tp[j-1] |
|
845 |
|
846 subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc |
|
847 addxccc $hi1, $hi0, $hi1 |
|
848 addxc %g0, %g0, $ovf |
|
849 stx $hi1, [$tp+8] |
|
850 add $tp, 16, $tp |
|
851 |
|
852 brnz,pt $i, .Louter |
|
853 sub $i, 8, $i |
|
854 |
|
855 sub $ap, $num, $ap ! rewind |
|
856 sub $np, $num, $np |
|
857 sub $tp, $num, $tp |
|
858 ba .Lsub |
|
859 subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc |
|
860 |
|
861 .align 16 |
|
862 .Lsub: |
|
863 ldx [$tp], $tj |
|
864 add $tp, 8, $tp |
|
865 ldx [$np+0], $nj |
|
866 add $np, 8, $np |
|
867 subccc $tj, $nj, $t2 ! tp[j]-np[j] |
|
868 srlx $tj, 32, $tj |
|
869 srlx $nj, 32, $nj |
|
870 subccc $tj, $nj, $t3 |
|
871 add $rp, 8, $rp |
|
872 st $t2, [$rp-4] ! reverse order |
|
873 st $t3, [$rp-8] |
|
874 brnz,pt $cnt, .Lsub |
|
875 sub $cnt, 8, $cnt |
|
876 |
|
877 sub $np, $num, $np ! rewind |
|
878 sub $tp, $num, $tp |
|
879 sub $rp, $num, $rp |
|
880 |
|
881 subc $ovf, %g0, $ovf ! handle upmost overflow bit |
|
882 and $tp, $ovf, $ap |
|
883 andn $rp, $ovf, $np |
|
884 or $np, $ap, $ap ! ap=borrow?tp:rp |
|
885 ba .Lcopy |
|
886 sub $num, 8, $cnt |
|
887 |
|
888 .align 16 |
|
889 .Lcopy: ! copy or in-place refresh |
|
890 ldx [$ap+0], $t2 |
|
891 add $ap, 8, $ap |
|
892 stx %g0, [$tp] ! zap |
|
893 add $tp, 8, $tp |
|
894 stx $t2, [$rp+0] |
|
895 add $rp, 8, $rp |
|
896 brnz $cnt, .Lcopy |
|
897 sub $cnt, 8, $cnt |
|
898 |
|
899 mov 1, %o0 |
|
900 ret |
|
901 restore |
|
902 .type bn_mul_mont_t4, #function |
|
903 .size bn_mul_mont_t4, .-bn_mul_mont_t4 |
|
904 ___ |
|
905 |
|
906 # int bn_mul_mont_gather5( |
|
907 $rp="%o0"; # u64 *rp, |
|
908 $ap="%o1"; # const u64 *ap, |
|
909 $bp="%o2"; # const u64 *pwrtbl, |
|
910 $np="%o3"; # const u64 *np, |
|
911 $n0p="%o4"; # const BN_ULONG *n0, |
|
912 $num="%o5"; # int num, # caller ensures that num is >=3 |
|
913 # int power); |
|
914 $code.=<<___; |
|
915 .globl bn_mul_mont_gather5_t4 |
|
916 .align 32 |
|
917 bn_mul_mont_gather5_t4: |
|
918 add %sp, STACK_BIAS, %g4 ! real top of stack |
|
919 sll $num, 3, $num ! size in bytes |
|
920 add $num, 63, %g1 |
|
921 andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes |
|
922 sub %g4, %g1, %g1 |
|
923 andn %g1, 63, %g1 ! align at 64 byte |
|
924 sub %g1, STACK_FRAME, %g1 ! new top of stack |
|
925 sub %g1, %g4, %g1 |
|
926 LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument |
|
927 |
|
928 save %sp, %g1, %sp |
|
929 ___ |
|
930 # +-------------------------------+<----- %sp |
|
931 # . . |
|
932 # +-------------------------------+<----- aligned at 64 bytes |
|
933 # | __int64 tmp[0] | |
|
934 # +-------------------------------+ |
|
935 # . . |
|
936 # . . |
|
937 # +-------------------------------+<----- aligned at 64 bytes |
|
938 # . . |
|
939 ($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5)); |
|
940 ($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7)); |
|
941 ($ovf,$i)=($t0,$t1); |
|
942 &load_ccr($bp,"%g4",$ccr); |
|
943 &load_b($bp,$m0,"%o7"); # m0=bp[0] |
|
944 |
|
945 $code.=<<___; |
|
946 ld [$n0p+0], $t0 ! pull n0[0..1] value |
|
947 ld [$n0p+4], $t1 |
|
948 add %sp, STACK_BIAS+STACK_FRAME, $tp |
|
949 sllx $t1, 32, $n0 |
|
950 or $t0, $n0, $n0 |
|
951 |
|
952 ldx [$ap+0], $aj ! ap[0] |
|
953 |
|
954 mulx $aj, $m0, $lo0 ! ap[0]*bp[0] |
|
955 umulxhi $aj, $m0, $hi0 |
|
956 |
|
957 ldx [$ap+8], $aj ! ap[1] |
|
958 add $ap, 16, $ap |
|
959 ldx [$np+0], $nj ! np[0] |
|
960 |
|
961 mulx $lo0, $n0, $m1 ! "tp[0]"*n0 |
|
962 |
|
963 mulx $aj, $m0, $alo ! ap[1]*bp[0] |
|
964 umulxhi $aj, $m0, $aj ! ahi=aj |
|
965 |
|
966 mulx $nj, $m1, $lo1 ! np[0]*m1 |
|
967 umulxhi $nj, $m1, $hi1 |
|
968 |
|
969 ldx [$np+8], $nj ! np[1] |
|
970 |
|
971 addcc $lo0, $lo1, $lo1 |
|
972 add $np, 16, $np |
|
973 addxc %g0, $hi1, $hi1 |
|
974 |
|
975 mulx $nj, $m1, $nlo ! np[1]*m1 |
|
976 umulxhi $nj, $m1, $nj ! nhi=nj |
|
977 |
|
978 ba .L1st_g5 |
|
979 sub $num, 24, $cnt ! cnt=num-3 |
|
980 |
|
981 .align 16 |
|
982 .L1st_g5: |
|
983 addcc $alo, $hi0, $lo0 |
|
984 addxc $aj, %g0, $hi0 |
|
985 |
|
986 ldx [$ap+0], $aj ! ap[j] |
|
987 addcc $nlo, $hi1, $lo1 |
|
988 add $ap, 8, $ap |
|
989 addxc $nj, %g0, $hi1 ! nhi=nj |
|
990 |
|
991 ldx [$np+0], $nj ! np[j] |
|
992 mulx $aj, $m0, $alo ! ap[j]*bp[0] |
|
993 add $np, 8, $np |
|
994 umulxhi $aj, $m0, $aj ! ahi=aj |
|
995 |
|
996 mulx $nj, $m1, $nlo ! np[j]*m1 |
|
997 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] |
|
998 umulxhi $nj, $m1, $nj ! nhi=nj |
|
999 addxc %g0, $hi1, $hi1 |
|
1000 stxa $lo1, [$tp]0xe2 ! tp[j-1] |
|
1001 add $tp, 8, $tp ! tp++ |
|
1002 |
|
1003 brnz,pt $cnt, .L1st_g5 |
|
1004 sub $cnt, 8, $cnt ! j-- |
|
1005 !.L1st_g5 |
|
1006 addcc $alo, $hi0, $lo0 |
|
1007 addxc $aj, %g0, $hi0 ! ahi=aj |
|
1008 |
|
1009 addcc $nlo, $hi1, $lo1 |
|
1010 addxc $nj, %g0, $hi1 |
|
1011 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] |
|
1012 addxc %g0, $hi1, $hi1 |
|
1013 stxa $lo1, [$tp]0xe2 ! tp[j-1] |
|
1014 add $tp, 8, $tp |
|
1015 |
|
1016 addcc $hi0, $hi1, $hi1 |
|
1017 addxc %g0, %g0, $ovf ! upmost overflow bit |
|
1018 stxa $hi1, [$tp]0xe2 |
|
1019 add $tp, 8, $tp |
|
1020 |
|
1021 ba .Louter_g5 |
|
1022 sub $num, 16, $i ! i=num-2 |
|
1023 |
|
1024 .align 16 |
|
1025 .Louter_g5: |
|
1026 wr $ccr, %g0, %ccr |
|
1027 ___ |
|
1028 &load_b($bp,$m0); # m0=bp[i] |
|
1029 $code.=<<___; |
|
1030 sub $ap, $num, $ap ! rewind |
|
1031 sub $np, $num, $np |
|
1032 sub $tp, $num, $tp |
|
1033 |
|
1034 ldx [$ap+0], $aj ! ap[0] |
|
1035 ldx [$np+0], $nj ! np[0] |
|
1036 |
|
1037 mulx $aj, $m0, $lo0 ! ap[0]*bp[i] |
|
1038 ldx [$tp], $tj ! tp[0] |
|
1039 umulxhi $aj, $m0, $hi0 |
|
1040 ldx [$ap+8], $aj ! ap[1] |
|
1041 addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0] |
|
1042 mulx $aj, $m0, $alo ! ap[1]*bp[i] |
|
1043 addxc %g0, $hi0, $hi0 |
|
1044 mulx $lo0, $n0, $m1 ! tp[0]*n0 |
|
1045 umulxhi $aj, $m0, $aj ! ahi=aj |
|
1046 mulx $nj, $m1, $lo1 ! np[0]*m1 |
|
1047 add $ap, 16, $ap |
|
1048 umulxhi $nj, $m1, $hi1 |
|
1049 ldx [$np+8], $nj ! np[1] |
|
1050 add $np, 16, $np |
|
1051 addcc $lo1, $lo0, $lo1 |
|
1052 mulx $nj, $m1, $nlo ! np[1]*m1 |
|
1053 addxc %g0, $hi1, $hi1 |
|
1054 umulxhi $nj, $m1, $nj ! nhi=nj |
|
1055 |
|
1056 ba .Linner_g5 |
|
1057 sub $num, 24, $cnt ! cnt=num-3 |
|
1058 .align 16 |
|
1059 .Linner_g5: |
|
1060 addcc $alo, $hi0, $lo0 |
|
1061 ldx [$tp+8], $tj ! tp[j] |
|
1062 addxc $aj, %g0, $hi0 ! ahi=aj |
|
1063 ldx [$ap+0], $aj ! ap[j] |
|
1064 add $ap, 8, $ap |
|
1065 addcc $nlo, $hi1, $lo1 |
|
1066 mulx $aj, $m0, $alo ! ap[j]*bp[i] |
|
1067 addxc $nj, %g0, $hi1 ! nhi=nj |
|
1068 ldx [$np+0], $nj ! np[j] |
|
1069 add $np, 8, $np |
|
1070 umulxhi $aj, $m0, $aj ! ahi=aj |
|
1071 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] |
|
1072 mulx $nj, $m1, $nlo ! np[j]*m1 |
|
1073 addxc %g0, $hi0, $hi0 |
|
1074 umulxhi $nj, $m1, $nj ! nhi=nj |
|
1075 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] |
|
1076 addxc %g0, $hi1, $hi1 |
|
1077 stx $lo1, [$tp] ! tp[j-1] |
|
1078 add $tp, 8, $tp |
|
1079 brnz,pt $cnt, .Linner_g5 |
|
1080 sub $cnt, 8, $cnt |
|
1081 !.Linner_g5 |
|
1082 ldx [$tp+8], $tj ! tp[j] |
|
1083 addcc $alo, $hi0, $lo0 |
|
1084 addxc $aj, %g0, $hi0 ! ahi=aj |
|
1085 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] |
|
1086 addxc %g0, $hi0, $hi0 |
|
1087 |
|
1088 addcc $nlo, $hi1, $lo1 |
|
1089 addxc $nj, %g0, $hi1 ! nhi=nj |
|
1090 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] |
|
1091 addxc %g0, $hi1, $hi1 |
|
1092 stx $lo1, [$tp] ! tp[j-1] |
|
1093 |
|
1094 subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc |
|
1095 addxccc $hi1, $hi0, $hi1 |
|
1096 addxc %g0, %g0, $ovf |
|
1097 stx $hi1, [$tp+8] |
|
1098 add $tp, 16, $tp |
|
1099 |
|
1100 brnz,pt $i, .Louter_g5 |
|
1101 sub $i, 8, $i |
|
1102 |
|
1103 sub $ap, $num, $ap ! rewind |
|
1104 sub $np, $num, $np |
|
1105 sub $tp, $num, $tp |
|
1106 ba .Lsub_g5 |
|
1107 subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc |
|
1108 |
|
1109 .align 16 |
|
1110 .Lsub_g5: |
|
1111 ldx [$tp], $tj |
|
1112 add $tp, 8, $tp |
|
1113 ldx [$np+0], $nj |
|
1114 add $np, 8, $np |
|
1115 subccc $tj, $nj, $t2 ! tp[j]-np[j] |
|
1116 srlx $tj, 32, $tj |
|
1117 srlx $nj, 32, $nj |
|
1118 subccc $tj, $nj, $t3 |
|
1119 add $rp, 8, $rp |
|
1120 st $t2, [$rp-4] ! reverse order |
|
1121 st $t3, [$rp-8] |
|
1122 brnz,pt $cnt, .Lsub_g5 |
|
1123 sub $cnt, 8, $cnt |
|
1124 |
|
1125 sub $np, $num, $np ! rewind |
|
1126 sub $tp, $num, $tp |
|
1127 sub $rp, $num, $rp |
|
1128 |
|
1129 subc $ovf, %g0, $ovf ! handle upmost overflow bit |
|
1130 and $tp, $ovf, $ap |
|
1131 andn $rp, $ovf, $np |
|
1132 or $np, $ap, $ap ! ap=borrow?tp:rp |
|
1133 ba .Lcopy_g5 |
|
1134 sub $num, 8, $cnt |
|
1135 |
|
1136 .align 16 |
|
1137 .Lcopy_g5: ! copy or in-place refresh |
|
1138 ldx [$ap+0], $t2 |
|
1139 add $ap, 8, $ap |
|
1140 stx %g0, [$tp] ! zap |
|
1141 add $tp, 8, $tp |
|
1142 stx $t2, [$rp+0] |
|
1143 add $rp, 8, $rp |
|
1144 brnz $cnt, .Lcopy_g5 |
|
1145 sub $cnt, 8, $cnt |
|
1146 |
|
1147 mov 1, %o0 |
|
1148 ret |
|
1149 restore |
|
1150 .type bn_mul_mont_gather5_t4, #function |
|
1151 .size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4 |
|
1152 ___ |
|
1153 } |
|
1154 |
|
# Append the VIS3 helper subroutines:
#  - bn_flip_t4: copies %o2 64-bit units from %o1 to %o0 with the two
#    32-bit halves of each unit swapped;
#  - bn_flip_n_scatter5_t4: same half-swap combined into one 64-bit store,
#    scattered into &pwrtbl[pwr] with a 32*8-byte stride;
#  - the entry point and alignment for bn_gather5_t4 (body emitted below).
$code.=<<___;
|
1156 .globl bn_flip_t4 |
|
1157 .align 32 |
|
1158 bn_flip_t4: |
|
1159 .Loop_flip: |
|
1160 ld [%o1+0], %o4 |
|
1161 sub %o2, 1, %o2 |
|
1162 ld [%o1+4], %o5 |
|
1163 add %o1, 8, %o1 |
|
1164 st %o5, [%o0+0] |
|
1165 st %o4, [%o0+4] |
|
1166 brnz %o2, .Loop_flip |
|
1167 add %o0, 8, %o0 |
|
1168 retl |
|
1169 nop |
|
1170 .type bn_flip_t4, #function |
|
1171 .size bn_flip_t4, .-bn_flip_t4 |
|
1172 |
|
1173 .globl bn_flip_n_scatter5_t4 |
|
1174 .align 32 |
|
1175 bn_flip_n_scatter5_t4: |
|
1176 sll %o3, 3, %o3 |
|
1177 srl %o1, 1, %o1 |
|
1178 add %o3, %o2, %o2 ! &pwrtbl[pwr] |
|
1179 sub %o1, 1, %o1 |
|
1180 .Loop_flip_n_scatter5: |
|
1181 ld [%o0+0], %o4 ! inp[i] |
|
1182 ld [%o0+4], %o5 |
|
1183 add %o0, 8, %o0 |
|
1184 sllx %o5, 32, %o5 |
|
1185 or %o4, %o5, %o5 |
|
1186 stx %o5, [%o2] |
|
1187 add %o2, 32*8, %o2 |
|
1188 brnz %o1, .Loop_flip_n_scatter5 |
|
1189 sub %o1, 1, %o1 |
|
1190 retl |
|
1191 nop |
|
1192 .type bn_flip_n_scatter5_t4, #function |
|
1193 .size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4 |
|
1194 |
|
1195 .globl bn_gather5_t4 |
|
1196 .align 32 |
|
1197 bn_gather5_t4: |
|
1198 ___ |
|
# Emit the %ccr set-up for the gather: presumably derives a selection mask
# from the power index (%o3) relative to the table base (%o2) so that the
# load_b below picks the requested entry in constant time — load_ccr is
# defined earlier in this file, TODO confirm against its definition.
&load_ccr("%o2","%o3","%g1");
|
# Loop prologue of bn_gather5_t4: pre-decrement the element count (%o1)
# to fit the branch-on-non-zero / decrement-in-delay-slot idiom used below.
$code.=<<___;
|
1201 sub %o1, 1, %o1 |
|
1202 .Loop_gather5: |
|
1203 ___ |
|
# Emit the gather of the current table element into %g1 (stored to [%o0]
# by the assembly appended below).
&load_b("%o2","%g1");
|
# Loop epilogue of bn_gather5_t4 (store, advance, branch, return) plus the
# module identification string and final alignment directive.
$code.=<<___;
|
1206 stx %g1, [%o0] |
|
1207 add %o0, 8, %o0 |
|
1208 brnz %o1, .Loop_gather5 |
|
1209 sub %o1, 1, %o1 |
|
1210 |
|
1211 retl |
|
1212 nop |
|
1213 .type bn_gather5_t4, #function |
|
1214 .size bn_gather5_t4, .-bn_gather5_t4 |
|
1215 |
|
1216 .asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov" |
|
1217 .align 4 |
|
1218 ___ |
|
1219 |
|
# Post-process the accumulated $code and print the final assembler text —
# presumably the shared sparcv9 perlasm helper that handles hwcap-gated
# instruction encodings (not defined in this file; confirm there).
&emit_assembler();
|
1221 |
|
# Flush and close STDOUT, reporting buffered-write failures: for a code
# generator whose output is piped into the assembler, a silently truncated
# stream would produce a corrupt object file. Write errors on a buffered
# handle only surface at close, so the result must be checked.
close STDOUT or die "error closing STDOUT: $!";