1 #!/usr/bin/env perl |
|
2 # |
|
3 # ==================================================================== |
|
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
|
5 # project. The module is, however, dual licensed under OpenSSL and |
|
6 # CRYPTOGAMS licenses depending on where you obtain it. For further |
|
7 # details see http://www.openssl.org/~appro/cryptogams/. |
|
8 # ==================================================================== |
|
9 # |
|
10 # October 2012 |
|
11 # |
|
12 # The module implements bn_GF2m_mul_2x2 polynomial multiplication used |
|
13 # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for |
|
14 # the time being... Except that it has two code paths: one suitable |
|
15 # for all SPARCv9 processors and one for VIS3-capable ones. Former |
|
16 # delivers ~25-45% more, more for longer keys, heaviest DH and DSA |
|
17 # verify operations on venerable UltraSPARC II. On T4 VIS3 code is |
|
18 # ~100-230% faster than gcc-generated code and ~35-90% faster than |
|
19 # the pure SPARCv9 code path. |
|
20 |
|
# Target selection.  The build is treated as 64-bit when any command-line
# argument contains "-m64" or "-xarch=v9" (unanchored match); otherwise it
# defaults to 32-bit.
$bits = (grep { /\-m64/ || /\-xarch\=v9/ } @ARGV) ? 64 : 32;

# $bias and $frame are interpolated into the generated code: the table base
# is computed as %sp+$bias+$frame and the `save` reserves $frame+$locals bytes.
($bias, $frame) = ($bits == 64) ? (2047, 192) : (0, 112);

# Room for the 16-entry lookup table of 8-byte values (tab[0]..tab[15]).
$locals = 16 * 8;
|
27 |
|
# Start of the generated output: include sparc_arch.h (run through the C
# preprocessor by the consumer of this output) and open the text section.
28 $code.=<<___; |
|
29 #include <sparc_arch.h> |
|
30 |
|
31 .section ".text",#alloc,#execinstr |
|
32 ___ |
|
# Only for 64-bit builds: declare %g2 and %g3 as scratch registers.  These
# are the two registers this script assigns to @T.
33 $code.=<<___ if ($bits==64); |
|
34 .register %g2,#scratch |
|
35 .register %g3,#scratch |
|
36 ___ |
|
37 |
|
# Symbolic register names used by the table-driven (.Lsoftware) code path.
$tab = "%l0";                         # base address of the lookup table

# Two-element sets whose members swap roles on every pass of the emission
# loop further down (push/shift rotation).
@T = map { "%g$_" } (2, 3);           # temporaries
@i = map { "%g$_" } (4, 5);           # table offsets / loaded table entries

# %o0..%o5 hold the shifted/masked copies of a and their XOR combinations.
($a1, $a2, $a4, $a8, $a12, $a48) = map { "%o$_" } 0 .. 5;

$lo = "%g1";                          # low 64 bits of the product
$hi = $a8;                            # high 64 bits; same register as $a8
$b  = "%o7";                          # second operand
$a  = $lo;                            # first operand; same register as $lo
|
45 |
|
# Emit the entry point of bn_GF2m_mul_2x2 and the first part of its body.
#
# The generated code loads OPENSSL_sparcv9cap_P[0] and tests SPARCV9_VIS3:
#   * bit set:   fast path.  %o1:%o2 and %o3:%o4 are each merged into one
#                64-bit value (first register of each pair is the upper 32
#                bits), multiplied with the .word-encoded xmulx/xmulxhi, and
#                the result is stored as four 32-bit words at [%o0].
#   * bit clear: branch to .Lsoftware, which opens a register window with
#                $frame+$locals bytes, merges the inputs into $a and $b the
#                same way, and builds tab[0..15] at %sp+$bias+$frame from
#                masked copies of a (a1, a2, a4, a8) and their XORs.  The
#                three top bits of a are masked off and handled separately
#                via the sign-broadcast values ANDed with $b.  The final
#                instructions begin the table lookups on successive 4-bit
#                fields of $b, accumulating into $lo/$hi.
#
# Everything between the heredoc markers is output text: "!" starts an
# assembler comment there, and back-ticked expressions such as `0xf<<3` are
# evaluated by the substitution at the end of this script.
46 $code.=<<___; |
|
47 #ifdef __PIC__ |
|
48 SPARC_PIC_THUNK(%g1) |
|
49 #endif |
|
50 |
|
51 .globl bn_GF2m_mul_2x2 |
|
52 .align 16 |
|
53 bn_GF2m_mul_2x2: |
|
54 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) |
|
55 ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0] |
|
56 |
|
57 andcc %g1, SPARCV9_VIS3, %g0 |
|
58 bz,pn %icc,.Lsoftware |
|
59 nop |
|
60 |
|
61 sllx %o1, 32, %o1 |
|
62 sllx %o3, 32, %o3 |
|
63 or %o2, %o1, %o1 |
|
64 or %o4, %o3, %o3 |
|
65 .word 0x95b262ab ! xmulx %o1, %o3, %o2 |
|
66 .word 0x99b262cb ! xmulxhi %o1, %o3, %o4 |
|
67 srlx %o2, 32, %o1 ! 13 cycles later |
|
68 st %o2, [%o0+0] |
|
69 st %o1, [%o0+4] |
|
70 srlx %o4, 32, %o3 |
|
71 st %o4, [%o0+8] |
|
72 retl |
|
73 st %o3, [%o0+12] |
|
74 |
|
75 .align 16 |
|
76 .Lsoftware: |
|
77 save %sp,-$frame-$locals,%sp |
|
78 |
|
79 sllx %i1,32,$a |
|
80 mov -1,$a12 |
|
81 sllx %i3,32,$b |
|
82 or %i2,$a,$a |
|
83 srlx $a12,1,$a48 ! 0x7fff... |
|
84 or %i4,$b,$b |
|
85 srlx $a12,2,$a12 ! 0x3fff... |
|
86 add %sp,$bias+$frame,$tab |
|
87 |
|
88 sllx $a,2,$a4 |
|
89 mov $a,$a1 |
|
90 sllx $a,1,$a2 |
|
91 |
|
92 srax $a4,63,@i[1] ! broadcast 61st bit |
|
93 and $a48,$a4,$a4 ! (a<<2)&0x7fff... |
|
94 srlx $a48,2,$a48 |
|
95 srax $a2,63,@i[0] ! broadcast 62nd bit |
|
96 and $a12,$a2,$a2 ! (a<<1)&0x3fff... |
|
97 srax $a1,63,$lo ! broadcast 63rd bit |
|
98 and $a48,$a1,$a1 ! (a<<0)&0x1fff... |
|
99 |
|
100 sllx $a1,3,$a8 |
|
101 and $b,$lo,$lo |
|
102 and $b,@i[0],@i[0] |
|
103 and $b,@i[1],@i[1] |
|
104 |
|
105 stx %g0,[$tab+0*8] ! tab[0]=0 |
|
106 xor $a1,$a2,$a12 |
|
107 stx $a1,[$tab+1*8] ! tab[1]=a1 |
|
108 stx $a2,[$tab+2*8] ! tab[2]=a2 |
|
109 xor $a4,$a8,$a48 |
|
110 stx $a12,[$tab+3*8] ! tab[3]=a1^a2 |
|
111 xor $a4,$a1,$a1 |
|
112 |
|
113 stx $a4,[$tab+4*8] ! tab[4]=a4 |
|
114 xor $a4,$a2,$a2 |
|
115 stx $a1,[$tab+5*8] ! tab[5]=a1^a4 |
|
116 xor $a4,$a12,$a12 |
|
117 stx $a2,[$tab+6*8] ! tab[6]=a2^a4 |
|
118 xor $a48,$a1,$a1 |
|
119 stx $a12,[$tab+7*8] ! tab[7]=a1^a2^a4 |
|
120 xor $a48,$a2,$a2 |
|
121 |
|
122 stx $a8,[$tab+8*8] ! tab[8]=a8 |
|
123 xor $a48,$a12,$a12 |
|
124 stx $a1,[$tab+9*8] ! tab[9]=a1^a8 |
|
125 xor $a4,$a1,$a1 |
|
126 stx $a2,[$tab+10*8] ! tab[10]=a2^a8 |
|
127 xor $a4,$a2,$a2 |
|
128 stx $a12,[$tab+11*8] ! tab[11]=a1^a2^a8 |
|
129 |
|
130 xor $a4,$a12,$a12 |
|
131 stx $a48,[$tab+12*8] ! tab[12]=a4^a8 |
|
132 srlx $lo,1,$hi |
|
133 stx $a1,[$tab+13*8] ! tab[13]=a1^a4^a8 |
|
134 sllx $lo,63,$lo |
|
135 stx $a2,[$tab+14*8] ! tab[14]=a2^a4^a8 |
|
136 srlx @i[0],2,@T[0] |
|
137 stx $a12,[$tab+15*8] ! tab[15]=a1^a2^a4^a8 |
|
138 |
|
139 sllx @i[0],62,$a1 |
|
140 sllx $b,3,@i[0] |
|
141 srlx @i[1],3,@T[1] |
|
142 and @i[0],`0xf<<3`,@i[0] |
|
143 sllx @i[1],61,$a2 |
|
144 ldx [$tab+@i[0]],@i[0] |
|
145 srlx $b,4-3,@i[1] |
|
146 xor @T[0],$hi,$hi |
|
147 and @i[1],`0xf<<3`,@i[1] |
|
148 xor $a1,$lo,$lo |
|
149 ldx [$tab+@i[1]],@i[1] |
|
150 xor @T[1],$hi,$hi |
|
151 |
|
152 xor @i[0],$lo,$lo |
|
153 srlx $b,8-3,@i[0] |
|
154 xor $a2,$lo,$lo |
|
155 and @i[0],`0xf<<3`,@i[0] |
|
156 ___ |
|
# Unrolled emission, 13 passes ($n = 1..13).  Each pass appends seven
# instructions that:
#   - XOR the table entry already held in @i[1], shifted left by 4*$n bits,
#     into $lo and its spilled upper bits (shifted right by 64-4*$n) into $hi;
#   - load the next table entry through the offset held in @i[0];
#   - extract a further 4-bit field of $b, pre-scaled by 8 (shift by
#     ($n+2)*4-3, mask 0xf<<3) so it can be used as a byte offset into tab.
# $n is deliberately not lexical: the code emitted after this loop reads its
# final value (14).
157 for($n=1;$n<14;$n++) { |
|
158 $code.=<<___; |
|
159 sllx @i[1],`$n*4`,@T[0] |
|
160 ldx [$tab+@i[0]],@i[0] |
|
161 srlx @i[1],`64-$n*4`,@T[1] |
|
162 xor @T[0],$lo,$lo |
|
163 srlx $b,`($n+2)*4`-3,@i[1] |
|
164 xor @T[1],$hi,$hi |
|
165 and @i[1],`0xf<<3`,@i[1] |
|
166 ___ |
|
# Rotate both two-element register lists so that element [0] and [1] trade
# places for the next pass.
167 push(@i,shift(@i)); push(@T,shift(@T)); |
|
168 } |
|
# Emit the end of the software path.  $n is 14 after the preceding loop, so
# the last two table entries are folded in with shifts of $n*4 = 56 and
# ($n+1)*4 = 60 bits.  The 128-bit result in $hi:$lo is then stored as four
# 32-bit words at [%i0+0..12], followed by ret/restore, the symbol's .type
# and .size directives, and an identification string.
169 $code.=<<___; |
|
170 sllx @i[1],`$n*4`,@T[0] |
|
171 ldx [$tab+@i[0]],@i[0] |
|
172 srlx @i[1],`64-$n*4`,@T[1] |
|
173 xor @T[0],$lo,$lo |
|
174 |
|
175 sllx @i[0],`($n+1)*4`,@T[0] |
|
176 xor @T[1],$hi,$hi |
|
177 srlx @i[0],`64-($n+1)*4`,@T[1] |
|
178 xor @T[0],$lo,$lo |
|
179 xor @T[1],$hi,$hi |
|
180 |
|
181 srlx $lo,32,%i1 |
|
182 st $lo,[%i0+0] |
|
183 st %i1,[%i0+4] |
|
184 srlx $hi,32,%i2 |
|
185 st $hi,[%i0+8] |
|
186 st %i2,[%i0+12] |
|
187 |
|
188 ret |
|
189 restore |
|
190 .type bn_GF2m_mul_2x2,#function |
|
191 .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 |
|
192 .asciz "GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" |
|
193 .align 4 |
|
194 ___ |
|
195 |
|
# Replace every back-ticked expression left in the generated text (for
# example `0xf<<3`) with the result of evaluating it as Perl: /e evaluates
# the replacement, /g applies it to all occurrences.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
# STDOUT is buffered, so a failed write (full disk, closed pipe) may only be
# reported when the handle is closed.  Die in that case so the caller does
# not go on to assemble a truncated file.
close STDOUT or die "error closing STDOUT: $!";
|