--- a/components/openssl/openssl-fips-140/patches/204-fips-by-default.patch Wed Nov 02 10:26:04 2016 -0700
+++ b/components/openssl/openssl-fips-140/patches/204-fips-by-default.patch Wed Nov 02 19:15:09 2016 -0700
@@ -1,5 +1,6 @@
# Developed in house: Solaris specific
# This patch enables FIPS mode in the _init routine.
+# Also, use the EVP API instead of the low-level SHA API in apps/speed.c.
--- a/crypto/cryptlib.c 2016-09-02 14:10:14.157867400 -0700
+++ b/crypto/cryptlib.c 2016-09-02 14:08:38.308229315 -0700
@@ -117,6 +117,8 @@
@@ -26,3 +27,25 @@
(void) pthread_atfork(solaris_fork_prep, solaris_fork_post, solaris_fork_post);
}
+--- a/apps/speed.c 2016-09-26 02:49:07.000000000 -0700
++++ b/apps/speed.c 2016-10-25 11:26:37.455939170 -0700
+@@ -1640,7 +1640,8 @@
+ print_message(names[D_SHA256], c[D_SHA256][j], lengths[j]);
+ Time_F(START);
+ for (count = 0, run = 1; COND(c[D_SHA256][j]); count++)
+- SHA256(buf, lengths[j], sha256);
++ EVP_Digest(buf, (unsigned long)lengths[j], sha256, NULL,
++ EVP_sha256(), NULL);
+ d = Time_F(STOP);
+ print_result(D_SHA256, j, count, d);
+ }
+@@ -1653,7 +1654,8 @@
+ print_message(names[D_SHA512], c[D_SHA512][j], lengths[j]);
+ Time_F(START);
+ for (count = 0, run = 1; COND(c[D_SHA512][j]); count++)
+- SHA512(buf, lengths[j], sha512);
++ EVP_Digest(buf, (unsigned long)lengths[j], sha512, NULL,
++ EVP_sha512(), NULL);
+ d = Time_F(STOP);
+ print_result(D_SHA512, j, count, d);
+ }
--- a/components/openssl/openssl-fips/Makefile Wed Nov 02 10:26:04 2016 -0700
+++ b/components/openssl/openssl-fips/Makefile Wed Nov 02 19:15:09 2016 -0700
@@ -80,6 +80,20 @@
COMPONENT_PRE_CONFIGURE_ACTION = ( cd $(@D); \
$(RM) $(SOURCE_DIR)/test/fips_aes_data; $(CP) -r $(SOURCE_DIR)/* .; )
+COMPONENT_POST_UNPACK_ACTION = \
+ ( echo "Cloning engines..."; \
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/sparc_arch.h $(@D)/crypto/; \
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/aest4-sparcv9.pl $(@D)/crypto/aes/asm; \
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/dest4-sparcv9.pl $(@D)/crypto/des/asm; \
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/sparcv9_modes.pl $(@D)/crypto/perlasm; \
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/vis3-mont.pl $(@D)/crypto/bn/asm; \
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/sparcv9-gf2m.pl $(@D)/crypto/bn/asm; \
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/sparct4-mont.pl $(@D)/crypto/bn/asm; \
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/e_des3.c $(@D)/crypto/evp; \
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/e_aes.c $(@D)/crypto/evp; \
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/sha1-sparcv9.pl $(@D)/crypto/sha/asm; \
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/sha512-sparcv9.pl $(@D)/crypto/sha/asm; )
+
# There is a specific way that must be followed to build the FIPS-140 canister.
# It is "./config fipscanisterbuild; make; make install" and is called a command
# set "U2" in the OpenSSL FIPS-140 User Guide.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/aest4-sparcv9.pl Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,921 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by David S. Miller <[email protected]> and Andy Polyakov
+# <[email protected]>. The module is licensed under 2-clause BSD
+# license. October 2012. All rights reserved.
+# ====================================================================
+
+######################################################################
+# AES for SPARC T4.
+#
+# AES round instructions complete in 3 cycles and can be issued every
+# cycle. It means that round calculations should take 4*rounds cycles,
+# because any given round instruction depends on result of *both*
+# previous instructions:
+#
+# |0 |1 |2 |3 |4
+# |01|01|01|
+# |23|23|23|
+# |01|01|...
+# |23|...
+#
+# Provided that fxor [with IV] takes 3 cycles to complete, critical
+# path length for CBC encrypt would be 3+4*rounds, or in other words
+# it should process one byte in at least (3+4*rounds)/16 cycles. This
+# estimate doesn't account for "collateral" instructions, such as
+# fetching input from memory, xor-ing it with zero-round key and
+# storing the result. Yet, *measured* performance [for data aligned
+# at 64-bit boundary!] deviates from this equation by less than 0.5%:
+#
+# 128-bit key 192- 256-
+# CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90
+# (*) numbers after slash are for
+# misaligned data.
+#
+# Out-of-order execution logic managed to fully overlap "collateral"
+# instructions with those on critical path. Amazing!
+#
+# As with Intel AES-NI, question is if it's possible to improve
+# performance of parallelizable modes by interleaving round
+# instructions. Given round instruction latency and throughput,
+# the optimal interleave factor is 2. But can we expect 2x performance
+# improvement? Well, as round instructions can be issued one per
+# cycle, they don't saturate the 2-way issue pipeline and therefore
+# there is room for "collateral" calculations... Yet, 2x speed-up
+# over CBC encrypt remains unattainable:
+#
+# 128-bit key 192- 256-
+# CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61
+# CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61
+# (*) numbers after slash are for
+# misaligned data.
+#
+# Estimates based on amount of instructions under assumption that
+# round instructions are not pairable with any other instruction
+# suggest that latter is the actual case and pipeline runs
+# underutilized. It should be noted that T4 out-of-order execution
+# logic is so capable that performance gain from 2x interleave is
+# not even impressive, ~7-13% over non-interleaved code, largest
+# for 256-bit keys.
+
+# To anchor to something else, software implementation processes
+# one byte in 29 cycles with 128-bit key on same processor. Intel
+# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
+# in 0.93, naturally with AES-NI.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # $dir = leading directory part of this script's path (up to the last / or \)
+push(@INC,"${dir}","${dir}../../perlasm"); # let require search that directory and its ../../perlasm
+require "sparcv9_modes.pl"; # presumably defines asm_init, alg_*_implement and emit_assembler used below -- confirm
+
+&asm_init(@ARGV); # hand the command-line arguments to asm_init (not defined in this file)
+
+$::evp=1; # if $evp is set to 0, script generates module with
+# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
+# points. These however are not fully compatible with openssl/aes.h,
+# because they expect AES_KEY to be aligned at 64-bit boundary. When
+# used through EVP, alignment is arranged at EVP layer. Second thing
+# that is arranged by EVP is at least 32-bit alignment of IV.
+
+######################################################################
+# single-round subroutines
+#
+{
+my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
+
+$code.=<<___ if ($::abibits==64);
+.register %g2,#scratch
+.register %g3,#scratch
+
+___
+$code.=<<___;
+#include <openssl/fipssyms.h>
+
+.text
+
+.globl aes_t4_encrypt
+.align 32
+aes_t4_encrypt:
+ andcc $inp, 7, %g1 ! is input aligned?
+ andn $inp, 7, $inp
+
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+
+ ldx [$inp + 0], %o4
+ bz,pt %icc, 1f
+ ldx [$inp + 8], %o5
+ ldx [$inp + 16], $inp
+ sll %g1, 3, %g1
+ sub %g0, %g1, %o3
+ sllx %o4, %g1, %o4
+ sllx %o5, %g1, %g1
+ srlx %o5, %o3, %o5
+ srlx $inp, %o3, %o3
+ or %o5, %o4, %o4
+ or %o3, %g1, %o5
+1:
+ ld [$key + 240], $rounds
+ ldd [$key + 16], %f12
+ ldd [$key + 24], %f14
+ xor %g4, %o4, %o4
+ xor %g5, %o5, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ srl $rounds, 1, $rounds
+ ldd [$key + 32], %f16
+ sub $rounds, 1, $rounds
+ ldd [$key + 40], %f18
+ add $key, 48, $key
+
+.Lenc:
+ aes_eround01 %f12, %f0, %f2, %f4
+ aes_eround23 %f14, %f0, %f2, %f2
+ ldd [$key + 0], %f12
+ ldd [$key + 8], %f14
+ sub $rounds,1,$rounds
+ aes_eround01 %f16, %f4, %f2, %f0
+ aes_eround23 %f18, %f4, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ brnz,pt $rounds, .Lenc
+ add $key, 32, $key
+
+ andcc $out, 7, $tmp ! is output aligned?
+ aes_eround01 %f12, %f0, %f2, %f4
+ aes_eround23 %f14, %f0, %f2, %f2
+ aes_eround01_l %f16, %f4, %f2, %f0
+ aes_eround23_l %f18, %f4, %f2, %f2
+
+ bnz,pn %icc, 2f
+ nop
+
+ std %f0, [$out + 0]
+ retl
+ std %f2, [$out + 8]
+
+2: alignaddrl $out, %g0, $out
+ mov 0xff, $mask
+ srl $mask, $tmp, $mask
+
+ faligndata %f0, %f0, %f4
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $mask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $mask, $mask
+ retl
+ stda %f8, [$out + $mask]0xc0 ! partial store
+.type aes_t4_encrypt,#function
+.size aes_t4_encrypt,.-aes_t4_encrypt
+
+.globl aes_t4_decrypt
+.align 32
+aes_t4_decrypt:
+ andcc $inp, 7, %g1 ! is input aligned?
+ andn $inp, 7, $inp
+
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+
+ ldx [$inp + 0], %o4
+ bz,pt %icc, 1f
+ ldx [$inp + 8], %o5
+ ldx [$inp + 16], $inp
+ sll %g1, 3, %g1
+ sub %g0, %g1, %o3
+ sllx %o4, %g1, %o4
+ sllx %o5, %g1, %g1
+ srlx %o5, %o3, %o5
+ srlx $inp, %o3, %o3
+ or %o5, %o4, %o4
+ or %o3, %g1, %o5
+1:
+ ld [$key + 240], $rounds
+ ldd [$key + 16], %f12
+ ldd [$key + 24], %f14
+ xor %g4, %o4, %o4
+ xor %g5, %o5, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ srl $rounds, 1, $rounds
+ ldd [$key + 32], %f16
+ sub $rounds, 1, $rounds
+ ldd [$key + 40], %f18
+ add $key, 48, $key
+
+.Ldec:
+ aes_dround01 %f12, %f0, %f2, %f4
+ aes_dround23 %f14, %f0, %f2, %f2
+ ldd [$key + 0], %f12
+ ldd [$key + 8], %f14
+ sub $rounds,1,$rounds
+ aes_dround01 %f16, %f4, %f2, %f0
+ aes_dround23 %f18, %f4, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ brnz,pt $rounds, .Ldec
+ add $key, 32, $key
+
+ andcc $out, 7, $tmp ! is output aligned?
+ aes_dround01 %f12, %f0, %f2, %f4
+ aes_dround23 %f14, %f0, %f2, %f2
+ aes_dround01_l %f16, %f4, %f2, %f0
+ aes_dround23_l %f18, %f4, %f2, %f2
+
+ bnz,pn %icc, 2f
+ nop
+
+ std %f0, [$out + 0]
+ retl
+ std %f2, [$out + 8]
+
+2: alignaddrl $out, %g0, $out
+ mov 0xff, $mask
+ srl $mask, $tmp, $mask
+
+ faligndata %f0, %f0, %f4
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $mask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $mask, $mask
+ retl
+ stda %f8, [$out + $mask]0xc0 ! partial store
+.type aes_t4_decrypt,#function
+.size aes_t4_decrypt,.-aes_t4_decrypt
+___
+}
+
+######################################################################
+# key setup subroutines
+#
+{
+my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
+$code.=<<___;
+.globl aes_t4_set_encrypt_key
+.align 32
+aes_t4_set_encrypt_key:
+.Lset_encrypt_key:
+ and $inp, 7, $tmp
+ alignaddr $inp, %g0, $inp
+ cmp $bits, 192
+ ldd [$inp + 0], %f0
+ bl,pt %icc,.L128
+ ldd [$inp + 8], %f2
+
+ be,pt %icc,.L192
+ ldd [$inp + 16], %f4
+ brz,pt $tmp, .L256aligned
+ ldd [$inp + 24], %f6
+
+ ldd [$inp + 32], %f8
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+ faligndata %f6, %f8, %f6
+.L256aligned:
+___
+for ($i=0; $i<6; $i++) { # 6 unrolled steps after .L256aligned: store %f0/%f2/%f4/%f6 at $out+32*$i, then update them with aes_kexpand1/2/0/2
+ $code.=<<___;
+ std %f0, [$out + `32*$i+0`]
+ aes_kexpand1 %f0, %f6, $i, %f0
+ std %f2, [$out + `32*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `32*$i+16`]
+ aes_kexpand0 %f4, %f2, %f4
+ std %f6, [$out + `32*$i+24`]
+ aes_kexpand2 %f6, %f4, %f6
+___
+} # $i is left at 6; the heredoc that follows still uses it for store offsets and the aes_kexpand1 operand
+$code.=<<___;
+ std %f0, [$out + `32*$i+0`]
+ aes_kexpand1 %f0, %f6, $i, %f0
+ std %f2, [$out + `32*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `32*$i+16`]
+ std %f6, [$out + `32*$i+24`]
+ std %f0, [$out + `32*$i+32`]
+ std %f2, [$out + `32*$i+40`]
+
+ mov 14, $tmp
+ st $tmp, [$out + 240]
+ retl
+ xor %o0, %o0, %o0
+
+.align 16
+.L192:
+ brz,pt $tmp, .L192aligned
+ nop
+
+ ldd [$inp + 24], %f6
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+.L192aligned:
+___
+for ($i=0; $i<7; $i++) { # 7 unrolled steps after .L192aligned: store %f0/%f2/%f4 at $out+24*$i, then update them with aes_kexpand1/2/2
+ $code.=<<___;
+ std %f0, [$out + `24*$i+0`]
+ aes_kexpand1 %f0, %f4, $i, %f0
+ std %f2, [$out + `24*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `24*$i+16`]
+ aes_kexpand2 %f4, %f2, %f4
+___
+} # $i is left at 7; the heredoc that follows still uses it for store offsets and the aes_kexpand1 operand
+$code.=<<___;
+ std %f0, [$out + `24*$i+0`]
+ aes_kexpand1 %f0, %f4, $i, %f0
+ std %f2, [$out + `24*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `24*$i+16`]
+ std %f0, [$out + `24*$i+24`]
+ std %f2, [$out + `24*$i+32`]
+
+ mov 12, $tmp
+ st $tmp, [$out + 240]
+ retl
+ xor %o0, %o0, %o0
+
+.align 16
+.L128:
+ brz,pt $tmp, .L128aligned
+ nop
+
+ ldd [$inp + 16], %f4
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+.L128aligned:
+___
+for ($i=0; $i<10; $i++) { # 10 unrolled steps after .L128aligned: store %f0/%f2 at $out+16*$i, then update them with aes_kexpand1/2
+ $code.=<<___;
+ std %f0, [$out + `16*$i+0`]
+ aes_kexpand1 %f0, %f2, $i, %f0
+ std %f2, [$out + `16*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+___
+} # $i is left at 10; the heredoc that follows uses it for the final two store offsets
+$code.=<<___;
+ std %f0, [$out + `16*$i+0`]
+ std %f2, [$out + `16*$i+8`]
+
+ mov 10, $tmp
+ st $tmp, [$out + 240]
+ retl
+ xor %o0, %o0, %o0
+.type aes_t4_set_encrypt_key,#function
+.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
+
+.globl aes_t4_set_decrypt_key
+.align 32
+aes_t4_set_decrypt_key:
+ mov %o7, %o5
+ call .Lset_encrypt_key
+ nop
+
+ mov %o5, %o7
+ sll $tmp, 4, $inp ! $tmp is number of rounds
+ add $tmp, 2, $tmp
+ add $out, $inp, $inp ! $inp=$out+16*rounds
+ srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4
+
+.Lkey_flip:
+ ldd [$out + 0], %f0
+ ldd [$out + 8], %f2
+ ldd [$out + 16], %f4
+ ldd [$out + 24], %f6
+ ldd [$inp + 0], %f8
+ ldd [$inp + 8], %f10
+ ldd [$inp - 16], %f12
+ ldd [$inp - 8], %f14
+ sub $tmp, 1, $tmp
+ std %f0, [$inp + 0]
+ std %f2, [$inp + 8]
+ std %f4, [$inp - 16]
+ std %f6, [$inp - 8]
+ std %f8, [$out + 0]
+ std %f10, [$out + 8]
+ std %f12, [$out + 16]
+ std %f14, [$out + 24]
+ add $out, 32, $out
+ brnz $tmp, .Lkey_flip
+ sub $inp, 32, $inp
+
+ retl
+ xor %o0, %o0, %o0
+.type aes_t4_set_decrypt_key,#function
+.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
+___
+}
+
+{{{
+my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
+my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
+
+$code.=<<___;
+.align 32
+_aes128_encrypt_1x:
+___
+for ($i=0; $i<4; $i++) { # 4 iterations of 4 round instructions on %f0/%f2 (via %f4), keyed from %f16..%f46
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+} # the heredoc that follows continues with %f48..%f54 and the *_l instruction forms
+$code.=<<___;
+ aes_eround01 %f48, %f0, %f2, %f4
+ aes_eround23 %f50, %f0, %f2, %f2
+ aes_eround01_l %f52, %f4, %f2, %f0
+ retl
+ aes_eround23_l %f54, %f4, %f2, %f2
+.type _aes128_encrypt_1x,#function
+.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
+
+.align 32
+_aes128_encrypt_2x:
+___
+for ($i=0; $i<4; $i++) { # same key registers as the 1x loop (%f16..%f46), applied to two blocks: %f0/%f2 and %f4/%f6, with %f8/%f10 as temporaries
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+} # the heredoc that follows continues with %f48..%f54 and the *_l instruction forms
+$code.=<<___;
+ aes_eround01 %f48, %f0, %f2, %f8
+ aes_eround23 %f50, %f0, %f2, %f2
+ aes_eround01 %f48, %f4, %f6, %f10
+ aes_eround23 %f50, %f4, %f6, %f6
+ aes_eround01_l %f52, %f8, %f2, %f0
+ aes_eround23_l %f54, %f8, %f2, %f2
+ aes_eround01_l %f52, %f10, %f6, %f4
+ retl
+ aes_eround23_l %f54, %f10, %f6, %f6
+.type _aes128_encrypt_2x,#function
+.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
+
+.align 32
+_aes128_loadkey:
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+___
+for ($i=2; $i<22;$i++) { # load key schedule: 20 ldd's, [$key+16]..[$key+168] into %f16..%f54 (the first 16 bytes go to %g4/%g5 above)
+ $code.=<<___;
+ ldd [$key + `8*$i`], %f`12+2*$i`
+___
+}
+$code.=<<___;
+ retl
+ nop
+.type _aes128_loadkey,#function
+.size _aes128_loadkey,.-_aes128_loadkey
+_aes128_load_enckey=_aes128_loadkey
+_aes128_load_deckey=_aes128_loadkey
+
+___
+
+&alg_cbc_encrypt_implement("aes",128); # alg_*_implement subs are not defined in this file; presumably provided by sparcv9_modes.pl -- confirm
+if ($::evp) { # CTR32 and XTS ("en"/"de") variants are requested only when $::evp is set
+ &alg_ctr32_implement("aes",128);
+ &alg_xts_implement("aes",128,"en");
+ &alg_xts_implement("aes",128,"de");
+}
+&alg_cbc_decrypt_implement("aes",128);
+
+$code.=<<___;
+.align 32
+_aes128_decrypt_1x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_dround01 %f48, %f0, %f2, %f4
+ aes_dround23 %f50, %f0, %f2, %f2
+ aes_dround01_l %f52, %f4, %f2, %f0
+ retl
+ aes_dround23_l %f54, %f4, %f2, %f2
+.type _aes128_decrypt_1x,#function
+.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
+
+.align 32
+_aes128_decrypt_2x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_dround01 %f48, %f0, %f2, %f8
+ aes_dround23 %f50, %f0, %f2, %f2
+ aes_dround01 %f48, %f4, %f6, %f10
+ aes_dround23 %f50, %f4, %f6, %f6
+ aes_dround01_l %f52, %f8, %f2, %f0
+ aes_dround23_l %f54, %f8, %f2, %f2
+ aes_dround01_l %f52, %f10, %f6, %f4
+ retl
+ aes_dround23_l %f54, %f10, %f6, %f6
+.type _aes128_decrypt_2x,#function
+.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
+___
+
+$code.=<<___;
+.align 32
+_aes192_encrypt_1x:
+___
+for ($i=0; $i<5; $i++) { # 5 iterations of 4 round instructions on %f0/%f2 (via %f4), keyed from %f16..%f54
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+} # the heredoc that follows continues with %f56..%f62 and the *_l instruction forms
+$code.=<<___;
+ aes_eround01 %f56, %f0, %f2, %f4
+ aes_eround23 %f58, %f0, %f2, %f2
+ aes_eround01_l %f60, %f4, %f2, %f0
+ retl
+ aes_eround23_l %f62, %f4, %f2, %f2
+.type _aes192_encrypt_1x,#function
+.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
+
+.align 32
+_aes192_encrypt_2x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_eround01 %f56, %f0, %f2, %f8
+ aes_eround23 %f58, %f0, %f2, %f2
+ aes_eround01 %f56, %f4, %f6, %f10
+ aes_eround23 %f58, %f4, %f6, %f6
+ aes_eround01_l %f60, %f8, %f2, %f0
+ aes_eround23_l %f62, %f8, %f2, %f2
+ aes_eround01_l %f60, %f10, %f6, %f4
+ retl
+ aes_eround23_l %f62, %f10, %f6, %f6
+.type _aes192_encrypt_2x,#function
+.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
+
+.align 32
+_aes256_encrypt_1x:
+ aes_eround01 %f16, %f0, %f2, %f4
+ aes_eround23 %f18, %f0, %f2, %f2
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_eround01 %f20, %f4, %f2, %f0
+ aes_eround23 %f22, %f4, %f2, %f2
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) { # $i runs 1..5, keyed from %f24..%f62; %f16-%f22 were used before this loop and then reloaded from [$key+208..232]
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+} # the heredoc that follows uses the reloaded %f16-%f22, then loads [$key+16..40] back into them
+$code.=<<___;
+ aes_eround01 %f16, %f0, %f2, %f4
+ aes_eround23 %f18, %f0, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_eround01_l %f20, %f4, %f2, %f0
+ aes_eround23_l %f22, %f4, %f2, %f2
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_encrypt_1x,#function
+.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
+
+.align 32
+_aes256_encrypt_2x:
+ aes_eround01 %f16, %f0, %f2, %f8
+ aes_eround23 %f18, %f0, %f2, %f2
+ aes_eround01 %f16, %f4, %f6, %f10
+ aes_eround23 %f18, %f4, %f6, %f6
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_eround01 %f20, %f8, %f2, %f0
+ aes_eround23 %f22, %f8, %f2, %f2
+ aes_eround01 %f20, %f10, %f6, %f4
+ aes_eround23 %f22, %f10, %f6, %f6
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_eround01 %f16, %f0, %f2, %f8
+ aes_eround23 %f18, %f0, %f2, %f2
+ aes_eround01 %f16, %f4, %f6, %f10
+ aes_eround23 %f18, %f4, %f6, %f6
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_eround01_l %f20, %f8, %f2, %f0
+ aes_eround23_l %f22, %f8, %f2, %f2
+ aes_eround01_l %f20, %f10, %f6, %f4
+ aes_eround23_l %f22, %f10, %f6, %f6
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_encrypt_2x,#function
+.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
+
+.align 32
+_aes192_loadkey:
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+___
+for ($i=2; $i<26;$i++) { # load key schedule: 24 ldd's, [$key+16]..[$key+200] into %f16..%f62 (the first 16 bytes go to %g4/%g5 above)
+ $code.=<<___;
+ ldd [$key + `8*$i`], %f`12+2*$i`
+___
+}
+$code.=<<___;
+ retl
+ nop
+.type _aes192_loadkey,#function
+.size _aes192_loadkey,.-_aes192_loadkey
+_aes256_loadkey=_aes192_loadkey
+_aes192_load_enckey=_aes192_loadkey
+_aes192_load_deckey=_aes192_loadkey
+_aes256_load_enckey=_aes192_loadkey
+_aes256_load_deckey=_aes192_loadkey
+___
+
+&alg_cbc_encrypt_implement("aes",256); # alg_*_implement subs are not defined in this file; presumably provided by sparcv9_modes.pl -- confirm
+&alg_cbc_encrypt_implement("aes",192);
+if ($::evp) { # requested only when $::evp is set; note XTS is requested for 256 (and 128 earlier) but not for 192
+ &alg_ctr32_implement("aes",256);
+ &alg_xts_implement("aes",256,"en");
+ &alg_xts_implement("aes",256,"de");
+ &alg_ctr32_implement("aes",192);
+}
+&alg_cbc_decrypt_implement("aes",192);
+&alg_cbc_decrypt_implement("aes",256);
+
+$code.=<<___;
+.align 32
+_aes256_decrypt_1x:
+ aes_dround01 %f16, %f0, %f2, %f4
+ aes_dround23 %f18, %f0, %f2, %f2
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_dround01 %f20, %f4, %f2, %f0
+ aes_dround23 %f22, %f4, %f2, %f2
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_dround01 %f16, %f0, %f2, %f4
+ aes_dround23 %f18, %f0, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_dround01_l %f20, %f4, %f2, %f0
+ aes_dround23_l %f22, %f4, %f2, %f2
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_decrypt_1x,#function
+.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
+
+.align 32
+_aes256_decrypt_2x:
+ aes_dround01 %f16, %f0, %f2, %f8
+ aes_dround23 %f18, %f0, %f2, %f2
+ aes_dround01 %f16, %f4, %f6, %f10
+ aes_dround23 %f18, %f4, %f6, %f6
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_dround01 %f20, %f8, %f2, %f0
+ aes_dround23 %f22, %f8, %f2, %f2
+ aes_dround01 %f20, %f10, %f6, %f4
+ aes_dround23 %f22, %f10, %f6, %f6
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_dround01 %f16, %f0, %f2, %f8
+ aes_dround23 %f18, %f0, %f2, %f2
+ aes_dround01 %f16, %f4, %f6, %f10
+ aes_dround23 %f18, %f4, %f6, %f6
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_dround01_l %f20, %f8, %f2, %f0
+ aes_dround23_l %f22, %f8, %f2, %f2
+ aes_dround01_l %f20, %f10, %f6, %f4
+ aes_dround23_l %f22, %f10, %f6, %f6
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_decrypt_2x,#function
+.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
+
+.align 32
+_aes192_decrypt_1x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_dround01 %f56, %f0, %f2, %f4
+ aes_dround23 %f58, %f0, %f2, %f2
+ aes_dround01_l %f60, %f4, %f2, %f0
+ retl
+ aes_dround23_l %f62, %f4, %f2, %f2
+.type _aes192_decrypt_1x,#function
+.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
+
+.align 32
+_aes192_decrypt_2x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_dround01 %f56, %f0, %f2, %f8
+ aes_dround23 %f58, %f0, %f2, %f2
+ aes_dround01 %f56, %f4, %f6, %f10
+ aes_dround23 %f58, %f4, %f6, %f6
+ aes_dround01_l %f60, %f8, %f2, %f0
+ aes_dround23_l %f62, %f8, %f2, %f2
+ aes_dround01_l %f60, %f10, %f6, %f4
+ retl
+ aes_dround23_l %f62, %f10, %f6, %f6
+.type _aes192_decrypt_2x,#function
+.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
+___
+}}}
+
+if (!$::evp) {
+$code.=<<___;
+.global AES_encrypt
+AES_encrypt=aes_t4_encrypt
+.global AES_decrypt
+AES_decrypt=aes_t4_decrypt
+.global AES_set_encrypt_key
+.align 32
+AES_set_encrypt_key:
+ andcc %o2, 7, %g0 ! check alignment
+ bnz,a,pn %icc, 1f
+ mov -1, %o0
+ brz,a,pn %o0, 1f
+ mov -1, %o0
+ brz,a,pn %o2, 1f
+ mov -1, %o0
+ andncc %o1, 0x1c0, %g0
+ bnz,a,pn %icc, 1f
+ mov -2, %o0
+ cmp %o1, 128
+ bl,a,pn %icc, 1f
+ mov -2, %o0
+ b aes_t4_set_encrypt_key
+ nop
+1: retl
+ nop
+.type AES_set_encrypt_key,#function
+.size AES_set_encrypt_key,.-AES_set_encrypt_key
+
+.global AES_set_decrypt_key
+.align 32
+AES_set_decrypt_key:
+ andcc %o2, 7, %g0 ! check alignment
+ bnz,a,pn %icc, 1f
+ mov -1, %o0
+ brz,a,pn %o0, 1f
+ mov -1, %o0
+ brz,a,pn %o2, 1f
+ mov -1, %o0
+ andncc %o1, 0x1c0, %g0
+ bnz,a,pn %icc, 1f
+ mov -2, %o0
+ cmp %o1, 128
+ bl,a,pn %icc, 1f
+ mov -2, %o0
+ b aes_t4_set_decrypt_key
+ nop
+1: retl
+ nop
+.type AES_set_decrypt_key,#function
+.size AES_set_decrypt_key,.-AES_set_decrypt_key
+___
+
+my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
+
+$code.=<<___;
+.globl AES_cbc_encrypt
+.align 32
+AES_cbc_encrypt:
+ ld [$key + 240], %g1
+ nop
+ brz $enc, .Lcbc_decrypt
+ cmp %g1, 12
+
+ bl,pt %icc, aes128_t4_cbc_encrypt
+ nop
+ be,pn %icc, aes192_t4_cbc_encrypt
+ nop
+ ba aes256_t4_cbc_encrypt
+ nop
+
+.Lcbc_decrypt:
+ bl,pt %icc, aes128_t4_cbc_decrypt
+ nop
+ be,pn %icc, aes192_t4_cbc_decrypt
+ nop
+ ba aes256_t4_cbc_decrypt
+ nop
+.type AES_cbc_encrypt,#function
+.size AES_cbc_encrypt,.-AES_cbc_encrypt
+___
+}
+$code.=<<___;
+.asciz "AES for SPARC T4, David S. Miller, Andy Polyakov"
+.align 4
+___
+
+&emit_assembler(); # not defined in this file; presumably post-processes and prints the accumulated $code -- confirm
+
+close STDOUT; # last statement of the script
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/dest4-sparcv9.pl Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,620 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by David S. Miller <[email protected]> and Andy Polyakov
+# <[email protected]>. The module is licensed under 2-clause BSD
+# license. March 2013. All rights reserved.
+# ====================================================================
+
+######################################################################
+# DES for SPARC T4.
+#
+# As with other hardware-assisted ciphers CBC encrypt results [for
+# aligned data] are virtually identical to critical path lengths:
+#
+# DES Triple-DES
+# CBC encrypt 4.14/4.15(*) 11.7/11.7
+# CBC decrypt 1.77/4.11(**) 6.42/7.47
+#
+# (*) numbers after slash are for
+# misaligned data;
+# (**) this is result for largest
+# block size, unlike all other
+# cases smaller blocks results
+# are better[?];
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path
+push(@INC,"${dir}","${dir}../../perlasm"); # look for helper modules next to the script and in ../../perlasm
+require "sparcv9_modes.pl"; # presumably supplies asm_init and emit_assembler used below - confirm
+
+&asm_init(@ARGV); # presumably sets $::abibits and $::size_t_cc from the command line - confirm
+
+$code.=<<___ if ($::abibits==64); # emitted for the 64-bit ABI only
+.register %g2,#scratch
+.register %g3,#scratch
+___
+
+$code.=<<___; # the include line below is emitted verbatim into the generated assembly
+
+#include <openssl/fipssyms.h>
+
+.text
+___
+
+{ my ($inp,$out)=("%o0","%o1");
+
+$code.=<<___;
+.align 32
+.globl des_t4_key_expand
+.type des_t4_key_expand,#function
+des_t4_key_expand: ! expand the 8-byte key at inp into 16 dwords (128 bytes) at out
+ andcc $inp, 0x7, %g0 ! is inp 8-byte aligned?
+ alignaddr $inp, %g0, $inp ! presumably rounds inp down and latches the offset for faligndata - confirm
+ bz,pt %icc, 1f ! aligned: skip the realignment
+ ldd [$inp + 0x00], %f0 ! delay slot, always executed: first dword
+ ldd [$inp + 0x08], %f2 ! misaligned: the key straddles two dwords
+ faligndata %f0, %f2, %f0 ! extract the 8 key bytes
+1: des_kexpand %f0, 0, %f0
+ des_kexpand %f0, 1, %f2
+ std %f0, [$out + 0x00] ! stores are interleaved with the expansion steps
+ des_kexpand %f2, 3, %f6
+ std %f2, [$out + 0x08]
+ des_kexpand %f2, 2, %f4
+ des_kexpand %f6, 3, %f10
+ std %f6, [$out + 0x18]
+ des_kexpand %f6, 2, %f8
+ std %f4, [$out + 0x10]
+ des_kexpand %f10, 3, %f14
+ std %f10, [$out + 0x28]
+ des_kexpand %f10, 2, %f12
+ std %f8, [$out + 0x20]
+ des_kexpand %f14, 1, %f16
+ std %f14, [$out + 0x38]
+ des_kexpand %f16, 3, %f20
+ std %f12, [$out + 0x30]
+ des_kexpand %f16, 2, %f18
+ std %f16, [$out + 0x40]
+ des_kexpand %f20, 3, %f24
+ std %f20, [$out + 0x50]
+ des_kexpand %f20, 2, %f22
+ std %f18, [$out + 0x48]
+ des_kexpand %f24, 3, %f28
+ std %f24, [$out + 0x60]
+ des_kexpand %f24, 2, %f26
+ std %f22, [$out + 0x58]
+ des_kexpand %f28, 1, %f30
+ std %f28, [$out + 0x70]
+ std %f26, [$out + 0x68]
+ retl
+ std %f30, [$out + 0x78] ! delay slot: last of the 16 dwords
+.size des_t4_key_expand,.-des_t4_key_expand
+___
+}
+{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
+ my ($ileft,$iright,$omask) = map("%g$_",(1..3));
+
+$code.=<<___;
+.globl des_t4_cbc_encrypt ! single-DES CBC encryption; args inp, out, len, key, ivec
+.align 32
+des_t4_cbc_encrypt:
+ cmp $len, 0
+ be,pn $::size_t_cc, .Lcbc_abort ! zero length: return without touching anything
+ srln $len, 0, $len ! needed on v8+, "nop" on v9
+ ld [$ivec + 0], %f0 ! load ivec
+ ld [$ivec + 4], %f1
+
+ and $inp, 7, $ileft ! byte offset of inp within its dword
+ andn $inp, 7, $inp ! round inp down to 8 bytes
+ sll $ileft, 3, $ileft ! offset in bits, used as the left-shift count
+ mov 0xff, $omask
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ sub %g0, $ileft, $iright ! negated bit offset, used as the right-shift count
+ and $out, 7, %g4 ! byte offset of out within its dword
+ alignaddrl $out, %g0, $out
+ srl $omask, %g4, $omask ! byte mask for the first partial store
+ srlx $len, 3, $len ! byte count becomes a count of 8-byte blocks
+ movrz %g4, 0, $omask ! out aligned: zero mask selects the plain-store path
+ prefetch [$out], 22
+
+ ldd [$key + 0x00], %f4 ! load key schedule
+ ldd [$key + 0x08], %f6
+ ldd [$key + 0x10], %f8
+ ldd [$key + 0x18], %f10
+ ldd [$key + 0x20], %f12
+ ldd [$key + 0x28], %f14
+ ldd [$key + 0x30], %f16
+ ldd [$key + 0x38], %f18
+ ldd [$key + 0x40], %f20
+ ldd [$key + 0x48], %f22
+ ldd [$key + 0x50], %f24
+ ldd [$key + 0x58], %f26
+ ldd [$key + 0x60], %f28
+ ldd [$key + 0x68], %f30
+ ldd [$key + 0x70], %f32
+ ldd [$key + 0x78], %f34
+
+.Ldes_cbc_enc_loop:
+ ldx [$inp + 0], %g4
+ brz,pt $ileft, 4f ! aligned input: this dword is the whole block
+ nop
+
+ ldx [$inp + 8], %g5 ! misaligned input: merge two adjacent dwords
+ sllx %g4, $ileft, %g4
+ srlx %g5, $iright, %g5
+ or %g5, %g4, %g4
+4:
+ movxtod %g4, %f2
+ prefetch [$inp + 8+63], 20
+ add $inp, 8, $inp
+ fxor %f2, %f0, %f0 ! ^= ivec
+ prefetch [$out + 63], 22
+
+ des_ip %f0, %f0
+ des_round %f4, %f6, %f0, %f0 ! every round instruction consumes two schedule dwords
+ des_round %f8, %f10, %f0, %f0
+ des_round %f12, %f14, %f0, %f0
+ des_round %f16, %f18, %f0, %f0
+ des_round %f20, %f22, %f0, %f0
+ des_round %f24, %f26, %f0, %f0
+ des_round %f28, %f30, %f0, %f0
+ des_round %f32, %f34, %f0, %f0
+ des_iip %f0, %f0
+
+ brnz,pn $omask, 2f ! nonzero mask: out is misaligned
+ sub $len, 1, $len ! delay slot, always executed
+
+ std %f0, [$out + 0]
+ brnz,pt $len, .Ldes_cbc_enc_loop
+ add $out, 8, $out
+
+ st %f0, [$ivec + 0] ! write out ivec
+ retl
+ st %f1, [$ivec + 4]
+.Lcbc_abort: ! zero-length exit, also used by the other three CBC routines in this file
+ retl
+ nop
+
+.align 16
+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
+ ! and ~4x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f2 ! handle unaligned output
+
+ stda %f2, [$out + $omask]0xc0 ! partial store
+ add $out, 8, $out
+ orn %g0, $omask, $omask ! invert the mask to cover the remaining bytes
+ stda %f2, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .Ldes_cbc_enc_loop+4 ! +4 skips the ldx at the loop head; %g4 was loaded at 2: above
+ orn %g0, $omask, $omask ! delay slot: restore the mask
+
+ st %f0, [$ivec + 0] ! write out ivec
+ retl
+ st %f1, [$ivec + 4]
+.type des_t4_cbc_encrypt,#function
+.size des_t4_cbc_encrypt,.-des_t4_cbc_encrypt
+
+.globl des_t4_cbc_decrypt ! single-DES CBC decryption; args inp, out, len, key, ivec
+.align 32
+des_t4_cbc_decrypt:
+ cmp $len, 0
+ be,pn $::size_t_cc, .Lcbc_abort ! zero length: return without touching anything
+ srln $len, 0, $len ! needed on v8+, "nop" on v9
+ ld [$ivec + 0], %f2 ! load ivec
+ ld [$ivec + 4], %f3
+
+ and $inp, 7, $ileft ! byte offset of inp within its dword
+ andn $inp, 7, $inp ! round inp down to 8 bytes
+ sll $ileft, 3, $ileft ! offset in bits, used as the left-shift count
+ mov 0xff, $omask
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ sub %g0, $ileft, $iright ! negated bit offset, used as the right-shift count
+ and $out, 7, %g4 ! byte offset of out within its dword
+ alignaddrl $out, %g0, $out
+ srl $omask, %g4, $omask ! byte mask for the first partial store
+ srlx $len, 3, $len ! byte count becomes a count of 8-byte blocks
+ movrz %g4, 0, $omask ! out aligned: zero mask selects the plain-store path
+ prefetch [$out], 22
+
+ ldd [$key + 0x78], %f4 ! load key schedule, last dword first
+ ldd [$key + 0x70], %f6
+ ldd [$key + 0x68], %f8
+ ldd [$key + 0x60], %f10
+ ldd [$key + 0x58], %f12
+ ldd [$key + 0x50], %f14
+ ldd [$key + 0x48], %f16
+ ldd [$key + 0x40], %f18
+ ldd [$key + 0x38], %f20
+ ldd [$key + 0x30], %f22
+ ldd [$key + 0x28], %f24
+ ldd [$key + 0x20], %f26
+ ldd [$key + 0x18], %f28
+ ldd [$key + 0x10], %f30
+ ldd [$key + 0x08], %f32
+ ldd [$key + 0x00], %f34
+
+.Ldes_cbc_dec_loop:
+ ldx [$inp + 0], %g4
+ brz,pt $ileft, 4f ! aligned input: this dword is the whole block
+ nop
+
+ ldx [$inp + 8], %g5 ! misaligned input: merge two adjacent dwords
+ sllx %g4, $ileft, %g4
+ srlx %g5, $iright, %g5
+ or %g5, %g4, %g4
+4:
+ movxtod %g4, %f0
+ prefetch [$inp + 8+63], 20
+ add $inp, 8, $inp
+ prefetch [$out + 63], 22
+
+ des_ip %f0, %f0
+ des_round %f4, %f6, %f0, %f0 ! every round instruction consumes two schedule dwords
+ des_round %f8, %f10, %f0, %f0
+ des_round %f12, %f14, %f0, %f0
+ des_round %f16, %f18, %f0, %f0
+ des_round %f20, %f22, %f0, %f0
+ des_round %f24, %f26, %f0, %f0
+ des_round %f28, %f30, %f0, %f0
+ des_round %f32, %f34, %f0, %f0
+ des_iip %f0, %f0
+
+ fxor %f2, %f0, %f0 ! ^= ivec
+ movxtod %g4, %f2 ! keep this input block as the next chaining value
+
+ brnz,pn $omask, 2f ! nonzero mask: out is misaligned
+ sub $len, 1, $len ! delay slot, always executed
+
+ std %f0, [$out + 0]
+ brnz,pt $len, .Ldes_cbc_dec_loop
+ add $out, 8, $out
+
+ st %f2, [$ivec + 0] ! write out ivec
+ retl
+ st %f3, [$ivec + 4]
+
+.align 16
+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
+ ! and ~4x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f0 ! handle unaligned output
+
+ stda %f0, [$out + $omask]0xc0 ! partial store
+ add $out, 8, $out
+ orn %g0, $omask, $omask ! invert the mask to cover the remaining bytes
+ stda %f0, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .Ldes_cbc_dec_loop+4 ! +4 skips the ldx at the loop head; %g4 was loaded at 2: above
+ orn %g0, $omask, $omask ! delay slot: restore the mask
+
+ st %f2, [$ivec + 0] ! write out ivec
+ retl
+ st %f3, [$ivec + 4]
+.type des_t4_cbc_decrypt,#function
+.size des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
+___
+
+# One might wonder why there are back-to-back des_iip/des_ip pairs
+# between EDE passes. Indeed, aren't they inverses of each other?
+# They almost are. The outcome of the pair is the 32-bit words being
+# swapped in the target register. Consider the des_iip/des_ip pair a way
+# to perform the due swap; it is actually the fastest way in this case.
+
+$code.=<<___;
+.globl des_t4_ede3_cbc_encrypt ! triple-DES CBC encryption; args inp, out, len, key, ivec
+.align 32
+des_t4_ede3_cbc_encrypt:
+ cmp $len, 0
+ be,pn $::size_t_cc, .Lcbc_abort ! zero length: return without touching anything
+ srln $len, 0, $len ! needed on v8+, "nop" on v9
+ ld [$ivec + 0], %f0 ! load ivec
+ ld [$ivec + 4], %f1
+
+ and $inp, 7, $ileft ! byte offset of inp within its dword
+ andn $inp, 7, $inp ! round inp down to 8 bytes
+ sll $ileft, 3, $ileft ! offset in bits, used as the left-shift count
+ mov 0xff, $omask
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ sub %g0, $ileft, $iright ! negated bit offset, used as the right-shift count
+ and $out, 7, %g4 ! byte offset of out within its dword
+ alignaddrl $out, %g0, $out
+ srl $omask, %g4, $omask ! byte mask for the first partial store
+ srlx $len, 3, $len ! byte count becomes a count of 8-byte blocks
+ movrz %g4, 0, $omask ! out aligned: zero mask selects the plain-store path
+ prefetch [$out], 22
+
+ ldd [$key + 0x00], %f4 ! load first key schedule (offsets 0x00..0x7f), kept across iterations
+ ldd [$key + 0x08], %f6
+ ldd [$key + 0x10], %f8
+ ldd [$key + 0x18], %f10
+ ldd [$key + 0x20], %f12
+ ldd [$key + 0x28], %f14
+ ldd [$key + 0x30], %f16
+ ldd [$key + 0x38], %f18
+ ldd [$key + 0x40], %f20
+ ldd [$key + 0x48], %f22
+ ldd [$key + 0x50], %f24
+ ldd [$key + 0x58], %f26
+ ldd [$key + 0x60], %f28
+ ldd [$key + 0x68], %f30
+ ldd [$key + 0x70], %f32
+ ldd [$key + 0x78], %f34
+
+.Ldes_ede3_cbc_enc_loop:
+ ldx [$inp + 0], %g4
+ brz,pt $ileft, 4f ! aligned input: this dword is the whole block
+ nop
+
+ ldx [$inp + 8], %g5 ! misaligned input: merge two adjacent dwords
+ sllx %g4, $ileft, %g4
+ srlx %g5, $iright, %g5
+ or %g5, %g4, %g4
+4:
+ movxtod %g4, %f2
+ prefetch [$inp + 8+63], 20
+ add $inp, 8, $inp
+ fxor %f2, %f0, %f0 ! ^= ivec
+ prefetch [$out + 63], 22
+
+ des_ip %f0, %f0
+ des_round %f4, %f6, %f0, %f0 ! pass 1: first schedule, forward order
+ des_round %f8, %f10, %f0, %f0
+ des_round %f12, %f14, %f0, %f0
+ des_round %f16, %f18, %f0, %f0
+ ldd [$key + 0x100-0x08], %f36 ! second schedule (0x80..0xff) is fetched last dword first, every iteration
+ ldd [$key + 0x100-0x10], %f38
+ des_round %f20, %f22, %f0, %f0
+ ldd [$key + 0x100-0x18], %f40
+ ldd [$key + 0x100-0x20], %f42
+ des_round %f24, %f26, %f0, %f0
+ ldd [$key + 0x100-0x28], %f44
+ ldd [$key + 0x100-0x30], %f46
+ des_round %f28, %f30, %f0, %f0
+ ldd [$key + 0x100-0x38], %f48
+ ldd [$key + 0x100-0x40], %f50
+ des_round %f32, %f34, %f0, %f0
+ ldd [$key + 0x100-0x48], %f52
+ ldd [$key + 0x100-0x50], %f54
+ des_iip %f0, %f0
+
+ ldd [$key + 0x100-0x58], %f56
+ ldd [$key + 0x100-0x60], %f58
+ des_ip %f0, %f0
+ ldd [$key + 0x100-0x68], %f60
+ ldd [$key + 0x100-0x70], %f62
+ des_round %f36, %f38, %f0, %f0 ! pass 2: second schedule, reversed
+ ldd [$key + 0x100-0x78], %f36 ! registers are reused as soon as their round has issued
+ ldd [$key + 0x100-0x80], %f38
+ des_round %f40, %f42, %f0, %f0
+ des_round %f44, %f46, %f0, %f0
+ des_round %f48, %f50, %f0, %f0
+ ldd [$key + 0x100+0x00], %f40 ! third schedule (0x100..0x17f), forward order
+ ldd [$key + 0x100+0x08], %f42
+ des_round %f52, %f54, %f0, %f0
+ ldd [$key + 0x100+0x10], %f44
+ ldd [$key + 0x100+0x18], %f46
+ des_round %f56, %f58, %f0, %f0
+ ldd [$key + 0x100+0x20], %f48
+ ldd [$key + 0x100+0x28], %f50
+ des_round %f60, %f62, %f0, %f0
+ ldd [$key + 0x100+0x30], %f52
+ ldd [$key + 0x100+0x38], %f54
+ des_round %f36, %f38, %f0, %f0
+ ldd [$key + 0x100+0x40], %f56
+ ldd [$key + 0x100+0x48], %f58
+ des_iip %f0, %f0
+
+ ldd [$key + 0x100+0x50], %f60
+ ldd [$key + 0x100+0x58], %f62
+ des_ip %f0, %f0
+ ldd [$key + 0x100+0x60], %f36
+ ldd [$key + 0x100+0x68], %f38
+ des_round %f40, %f42, %f0, %f0 ! pass 3: third schedule, forward order
+ ldd [$key + 0x100+0x70], %f40
+ ldd [$key + 0x100+0x78], %f42
+ des_round %f44, %f46, %f0, %f0
+ des_round %f48, %f50, %f0, %f0
+ des_round %f52, %f54, %f0, %f0
+ des_round %f56, %f58, %f0, %f0
+ des_round %f60, %f62, %f0, %f0
+ des_round %f36, %f38, %f0, %f0
+ des_round %f40, %f42, %f0, %f0
+ des_iip %f0, %f0
+
+ brnz,pn $omask, 2f ! nonzero mask: out is misaligned
+ sub $len, 1, $len ! delay slot, always executed
+
+ std %f0, [$out + 0]
+ brnz,pt $len, .Ldes_ede3_cbc_enc_loop
+ add $out, 8, $out
+
+ st %f0, [$ivec + 0] ! write out ivec
+ retl
+ st %f1, [$ivec + 4]
+
+.align 16
+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
+ ! and ~2x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f2 ! handle unaligned output
+
+ stda %f2, [$out + $omask]0xc0 ! partial store
+ add $out, 8, $out
+ orn %g0, $omask, $omask ! invert the mask to cover the remaining bytes
+ stda %f2, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .Ldes_ede3_cbc_enc_loop+4 ! +4 skips the ldx at the loop head; %g4 was loaded at 2: above
+ orn %g0, $omask, $omask ! delay slot: restore the mask
+
+ st %f0, [$ivec + 0] ! write out ivec
+ retl
+ st %f1, [$ivec + 4]
+.type des_t4_ede3_cbc_encrypt,#function
+.size des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt
+
+.globl des_t4_ede3_cbc_decrypt ! triple-DES CBC decryption; args inp, out, len, key, ivec
+.align 32
+des_t4_ede3_cbc_decrypt:
+ cmp $len, 0
+ be,pn $::size_t_cc, .Lcbc_abort ! zero length: return without touching anything
+ srln $len, 0, $len ! needed on v8+, "nop" on v9
+ ld [$ivec + 0], %f2 ! load ivec
+ ld [$ivec + 4], %f3
+
+ and $inp, 7, $ileft ! byte offset of inp within its dword
+ andn $inp, 7, $inp ! round inp down to 8 bytes
+ sll $ileft, 3, $ileft ! offset in bits, used as the left-shift count
+ mov 0xff, $omask
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ sub %g0, $ileft, $iright ! negated bit offset, used as the right-shift count
+ and $out, 7, %g4 ! byte offset of out within its dword
+ alignaddrl $out, %g0, $out
+ srl $omask, %g4, $omask ! byte mask for the first partial store
+ srlx $len, 3, $len ! byte count becomes a count of 8-byte blocks
+ movrz %g4, 0, $omask ! out aligned: zero mask selects the plain-store path
+ prefetch [$out], 22
+
+ ldd [$key + 0x100+0x78], %f4 ! load third key schedule (0x100..0x17f), last dword first; kept across iterations
+ ldd [$key + 0x100+0x70], %f6
+ ldd [$key + 0x100+0x68], %f8
+ ldd [$key + 0x100+0x60], %f10
+ ldd [$key + 0x100+0x58], %f12
+ ldd [$key + 0x100+0x50], %f14
+ ldd [$key + 0x100+0x48], %f16
+ ldd [$key + 0x100+0x40], %f18
+ ldd [$key + 0x100+0x38], %f20
+ ldd [$key + 0x100+0x30], %f22
+ ldd [$key + 0x100+0x28], %f24
+ ldd [$key + 0x100+0x20], %f26
+ ldd [$key + 0x100+0x18], %f28
+ ldd [$key + 0x100+0x10], %f30
+ ldd [$key + 0x100+0x08], %f32
+ ldd [$key + 0x100+0x00], %f34
+
+.Ldes_ede3_cbc_dec_loop:
+ ldx [$inp + 0], %g4
+ brz,pt $ileft, 4f ! aligned input: this dword is the whole block
+ nop
+
+ ldx [$inp + 8], %g5 ! misaligned input: merge two adjacent dwords
+ sllx %g4, $ileft, %g4
+ srlx %g5, $iright, %g5
+ or %g5, %g4, %g4
+4:
+ movxtod %g4, %f0
+ prefetch [$inp + 8+63], 20
+ add $inp, 8, $inp
+ prefetch [$out + 63], 22
+
+ des_ip %f0, %f0
+ des_round %f4, %f6, %f0, %f0 ! pass 1: third schedule, reversed
+ des_round %f8, %f10, %f0, %f0
+ des_round %f12, %f14, %f0, %f0
+ des_round %f16, %f18, %f0, %f0
+ ldd [$key + 0x80+0x00], %f36 ! second schedule (0x80..0xff), forward order, fetched every iteration
+ ldd [$key + 0x80+0x08], %f38
+ des_round %f20, %f22, %f0, %f0
+ ldd [$key + 0x80+0x10], %f40
+ ldd [$key + 0x80+0x18], %f42
+ des_round %f24, %f26, %f0, %f0
+ ldd [$key + 0x80+0x20], %f44
+ ldd [$key + 0x80+0x28], %f46
+ des_round %f28, %f30, %f0, %f0
+ ldd [$key + 0x80+0x30], %f48
+ ldd [$key + 0x80+0x38], %f50
+ des_round %f32, %f34, %f0, %f0
+ ldd [$key + 0x80+0x40], %f52
+ ldd [$key + 0x80+0x48], %f54
+ des_iip %f0, %f0
+
+ ldd [$key + 0x80+0x50], %f56
+ ldd [$key + 0x80+0x58], %f58
+ des_ip %f0, %f0
+ ldd [$key + 0x80+0x60], %f60
+ ldd [$key + 0x80+0x68], %f62
+ des_round %f36, %f38, %f0, %f0 ! pass 2: second schedule, forward order
+ ldd [$key + 0x80+0x70], %f36 ! registers are reused as soon as their round has issued
+ ldd [$key + 0x80+0x78], %f38
+ des_round %f40, %f42, %f0, %f0
+ des_round %f44, %f46, %f0, %f0
+ des_round %f48, %f50, %f0, %f0
+ ldd [$key + 0x80-0x08], %f40 ! first schedule (0x00..0x7f), last dword first
+ ldd [$key + 0x80-0x10], %f42
+ des_round %f52, %f54, %f0, %f0
+ ldd [$key + 0x80-0x18], %f44
+ ldd [$key + 0x80-0x20], %f46
+ des_round %f56, %f58, %f0, %f0
+ ldd [$key + 0x80-0x28], %f48
+ ldd [$key + 0x80-0x30], %f50
+ des_round %f60, %f62, %f0, %f0
+ ldd [$key + 0x80-0x38], %f52
+ ldd [$key + 0x80-0x40], %f54
+ des_round %f36, %f38, %f0, %f0
+ ldd [$key + 0x80-0x48], %f56
+ ldd [$key + 0x80-0x50], %f58
+ des_iip %f0, %f0
+
+ ldd [$key + 0x80-0x58], %f60
+ ldd [$key + 0x80-0x60], %f62
+ des_ip %f0, %f0
+ ldd [$key + 0x80-0x68], %f36
+ ldd [$key + 0x80-0x70], %f38
+ des_round %f40, %f42, %f0, %f0 ! pass 3: first schedule, reversed
+ ldd [$key + 0x80-0x78], %f40
+ ldd [$key + 0x80-0x80], %f42
+ des_round %f44, %f46, %f0, %f0
+ des_round %f48, %f50, %f0, %f0
+ des_round %f52, %f54, %f0, %f0
+ des_round %f56, %f58, %f0, %f0
+ des_round %f60, %f62, %f0, %f0
+ des_round %f36, %f38, %f0, %f0
+ des_round %f40, %f42, %f0, %f0
+ des_iip %f0, %f0
+
+ fxor %f2, %f0, %f0 ! ^= ivec
+ movxtod %g4, %f2 ! keep this input block as the next chaining value
+
+ brnz,pn $omask, 2f ! nonzero mask: out is misaligned
+ sub $len, 1, $len ! delay slot, always executed
+
+ std %f0, [$out + 0]
+ brnz,pt $len, .Ldes_ede3_cbc_dec_loop
+ add $out, 8, $out
+
+ st %f2, [$ivec + 0] ! write out ivec
+ retl
+ st %f3, [$ivec + 4]
+
+.align 16
+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f0 ! handle unaligned output
+
+ stda %f0, [$out + $omask]0xc0 ! partial store
+ add $out, 8, $out
+ orn %g0, $omask, $omask ! invert the mask to cover the remaining bytes
+ stda %f0, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .Ldes_ede3_cbc_dec_loop+4 ! +4 skips the ldx at the loop head; %g4 was loaded at 2: above
+ orn %g0, $omask, $omask ! delay slot: restore the mask
+
+ st %f2, [$ivec + 0] ! write out ivec
+ retl
+ st %f3, [$ivec + 4]
+.type des_t4_ede3_cbc_decrypt,#function
+.size des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
+___
+}
+$code.=<<___; # append an identification string to the generated assembly
+.asciz "DES for SPARC T4, David S. Miller, Andy Polyakov"
+.align 4
+___
+
+&emit_assembler(); # not defined in this file; presumably provided by sparcv9_modes.pl to post-process and print the code - confirm
+
+close STDOUT; # last statement in the script
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/e_aes.c Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,1922 @@
+/* ====================================================================
+ * Copyright (c) 2001-2011 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ */
+
+#include <openssl/opensslconf.h>
+#ifndef OPENSSL_NO_AES
+#include <openssl/crypto.h>
+# include <openssl/evp.h>
+# include <openssl/err.h>
+# include <string.h>
+# include <assert.h>
+# include <openssl/aes.h>
+# include "evp_locl.h"
+# include "modes_lcl.h"
+# include <openssl/rand.h>
+
+typedef struct { /* Per-context state for the basic AES modes; filled in by the *_init_key routines */
+ union {
+ double align; /* presumably only there to force alignment of the schedule - confirm */
+ AES_KEY ks;
+ } ks; /* first member, so ctx->cipher_data can be passed directly as the schedule */
+ block128_f block; /* single-block encrypt or decrypt routine */
+ union {
+ cbc128_f cbc;
+ ctr128_f ctr;
+ } stream; /* optional bulk routine for the mode; cbc is set to NULL when there is none */
+} EVP_AES_KEY;
+
+typedef struct { /* Per-context state for AES-GCM */
+ union {
+ double align; /* presumably only there to force alignment of the schedule - confirm */
+ AES_KEY ks;
+ } ks; /* AES key schedule to use */
+ int key_set; /* Set if key initialised */
+ int iv_set; /* Set if an iv is set */
+ GCM128_CONTEXT gcm; /* set up by CRYPTO_gcm128_init() when the key is installed */
+ unsigned char *iv; /* Temporary IV store */
+ int ivlen; /* IV length */
+ int taglen; /* presumably the authentication tag length - confirm */
+ int iv_gen; /* It is OK to generate IVs */
+ int tls_aad_len; /* TLS AAD length */
+ ctr128_f ctr; /* bulk CTR routine chosen when the key is installed */
+} EVP_AES_GCM_CTX;
+
+typedef struct { /* Per-context state for AES-XTS */
+ union {
+ double align; /* presumably only there to force alignment of the schedules - confirm */
+ AES_KEY ks;
+ } ks1, ks2; /* AES key schedules to use */
+ XTS128_CONTEXT xts; /* block1/key1 and block2/key2 are filled in by the *_xts_init_key routines */
+ void (*stream) (const unsigned char *in,
+ unsigned char *out, size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]); /* bulk XTS routine, e.g. aesni_xts_encrypt or aesni_xts_decrypt */
+} EVP_AES_XTS_CTX;
+
+typedef struct { /* Per-context state for AES-CCM */
+ union {
+ double align; /* presumably only there to force alignment of the schedule - confirm */
+ AES_KEY ks;
+ } ks; /* AES key schedule to use */
+ int key_set; /* Set if key initialised */
+ int iv_set; /* Set if an iv is set */
+ int tag_set; /* Set if tag is valid */
+ int len_set; /* Set if message length set */
+ int L, M; /* L and M parameters from RFC3610 */
+ CCM128_CONTEXT ccm; /* set up by CRYPTO_ccm128_init() when the key is installed */
+ ccm128_f str; /* bulk CCM routine, chosen by direction when the key is installed */
+} EVP_AES_CCM_CTX;
+
+# define MAXBITCHUNK ((size_t)1<<(sizeof(size_t)*8-4))
+
+# ifdef VPAES_ASM
+int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void vpaes_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void vpaes_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+
+void vpaes_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key, unsigned char *ivec, int enc);
+# endif
+# ifdef BSAES_ASM
+void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ unsigned char ivec[16], int enc);
+void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ const unsigned char ivec[16]);
+void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
+ size_t len, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char iv[16]);
+void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
+ size_t len, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char iv[16]);
+# endif
+# ifdef AES_CTR_ASM
+void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key,
+ const unsigned char ivec[AES_BLOCK_SIZE]);
+# endif
+# ifdef AES_XTS_ASM
+void AES_xts_encrypt(const char *inp, char *out, size_t len,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+void AES_xts_decrypt(const char *inp, char *out, size_t len,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+# endif
+
+# if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
+# include "ppc_arch.h"
+# ifdef VPAES_ASM
+# define VPAES_CAPABLE (OPENSSL_ppccap_P & PPC_ALTIVEC)
+# endif
+# define HWAES_CAPABLE (OPENSSL_ppccap_P & PPC_CRYPTO207)
+# define HWAES_set_encrypt_key aes_p8_set_encrypt_key
+# define HWAES_set_decrypt_key aes_p8_set_decrypt_key
+# define HWAES_encrypt aes_p8_encrypt
+# define HWAES_decrypt aes_p8_decrypt
+# define HWAES_cbc_encrypt aes_p8_cbc_encrypt
+# define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks
+# endif
+
+# if defined(AES_ASM) && !defined(I386_ONLY) && ( \
+ ((defined(__i386) || defined(__i386__) || \
+ defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__INTEL__) )
+
+extern unsigned int OPENSSL_ia32cap_P[];
+
+# ifdef VPAES_ASM
+# define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
+# endif
+# ifdef BSAES_ASM
+# define BSAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
+# endif
+/*
+ * AES-NI section
+ */
+# define AESNI_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(57-32)))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void aesni_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void aesni_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+
+void aesni_ecb_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length, const AES_KEY *key, int enc);
+void aesni_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key, unsigned char *ivec, int enc);
+
+void aesni_ctr32_encrypt_blocks(const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key, const unsigned char *ivec);
+
+void aesni_xts_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+
+void aesni_xts_decrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+
+void aesni_ccm64_encrypt_blocks(const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char ivec[16],
+ unsigned char cmac[16]);
+
+void aesni_ccm64_decrypt_blocks(const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char ivec[16],
+ unsigned char cmac[16]);
+
+# if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
+size_t aesni_gcm_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t len,
+ const void *key, unsigned char ivec[16], u64 *Xi);
+# define AES_gcm_encrypt aesni_gcm_encrypt
+size_t aesni_gcm_decrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t len,
+ const void *key, unsigned char ivec[16], u64 *Xi);
+# define AES_gcm_decrypt aesni_gcm_decrypt
+void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *in,
+ size_t len);
+# define AES_GCM_ASM(gctx) (gctx->ctr==aesni_ctr32_encrypt_blocks && \
+ gctx->gcm.ghash==gcm_ghash_avx)
+# define AES_GCM_ASM2(gctx) (gctx->gcm.block==(block128_f)aesni_encrypt && \
+ gctx->gcm.ghash==gcm_ghash_avx)
+# undef AES_GCM_ASM2 /* minor size optimization */
+# endif
+
+static int aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, /* EVP init hook for the AES-NI tables built by BLOCK_CIPHER_generic */
+ const unsigned char *iv, int enc) /* iv is not used here; enc is nonzero for encryption */
+{ /* returns 1 on success, 0 when key setup reports an error */
+ int ret, mode;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+
+ mode = ctx->cipher->flags & EVP_CIPH_MODE;
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc) { /* only ECB/CBC decryption gets a decryption schedule */
+ ret = aesni_set_decrypt_key(key, ctx->key_len * 8, ctx->cipher_data); /* key_len is in bytes */
+ dat->block = (block128_f) aesni_decrypt;
+ dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+ (cbc128_f) aesni_cbc_encrypt : NULL; /* one CBC routine serves both directions */
+ } else { /* encryption, and every other mode in either direction */
+ ret = aesni_set_encrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+ dat->block = (block128_f) aesni_encrypt;
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) aesni_cbc_encrypt;
+ else if (mode == EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f) aesni_ctr32_encrypt_blocks;
+ else
+ dat->stream.cbc = NULL; /* no bulk routine for the remaining modes */
+ }
+
+ if (ret < 0) { /* checked only after the routine pointers were stored */
+ EVPerr(EVP_F_AESNI_INIT_KEY, EVP_R_AES_KEY_SETUP_FAILED);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int aesni_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, /* CBC do_cipher hook for the AES-NI tables */
+ const unsigned char *in, size_t len)
+{
+ aesni_cbc_encrypt(in, out, len, ctx->cipher_data, ctx->iv, ctx->encrypt); /* direction comes from ctx->encrypt */
+
+ return 1; /* always reports success */
+}
+
+static int aesni_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, /* ECB do_cipher hook for the AES-NI tables */
+ const unsigned char *in, size_t len)
+{
+ size_t bl = ctx->cipher->block_size;
+
+ if (len < bl) /* less than one block: nothing is processed */
+ return 1; /* still reported as success */
+
+ aesni_ecb_encrypt(in, out, len, ctx->cipher_data, ctx->encrypt); /* direction comes from ctx->encrypt */
+
+ return 1;
+}
+
+# define aesni_ofb_cipher aes_ofb_cipher
+static int aesni_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aesni_cfb_cipher aes_cfb_cipher
+static int aesni_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aesni_cfb8_cipher aes_cfb8_cipher
+static int aesni_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aesni_cfb1_cipher aes_cfb1_cipher
+static int aesni_cfb1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aesni_ctr_cipher aes_ctr_cipher
+static int aesni_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, /* GCM init hook for AES-NI; key and iv may each be NULL */
+ const unsigned char *iv, int enc) /* enc is not used: the encrypt schedule serves both directions */
+{
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ if (!iv && !key) /* nothing to install */
+ return 1;
+ if (key) {
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks); /* return value is not checked */
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f) aesni_encrypt);
+ gctx->ctr = (ctr128_f) aesni_ctr32_encrypt_blocks;
+ /*
+ * If we have an iv can set it directly, otherwise use saved IV.
+ */
+ if (iv == NULL && gctx->iv_set)
+ iv = gctx->iv;
+ if (iv) {
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ }
+ gctx->key_set = 1;
+ } else { /* key is NULL, so iv is non-NULL here */
+ /* If key set use IV, otherwise copy */
+ if (gctx->key_set)
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ else
+ memcpy(gctx->iv, iv, gctx->ivlen); /* stash it until a key arrives */
+ gctx->iv_set = 1;
+ gctx->iv_gen = 0;
+ }
+ return 1; /* always reports success */
+}
+
+# define aesni_gcm_cipher aes_gcm_cipher
+static int aesni_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, /* XTS init hook for AES-NI; key and iv may each be NULL */
+ const unsigned char *iv, int enc)
+{
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!iv && !key) /* nothing to install */
+ return 1;
+
+ if (key) {
+ /* key_len is two AES keys */
+ if (enc) { /* key_len * 4 is the bit length of one of the two keys */
+ aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) aesni_encrypt;
+ xctx->stream = aesni_xts_encrypt;
+ } else {
+ aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) aesni_decrypt;
+ xctx->stream = aesni_xts_decrypt;
+ }
+
+ aesni_set_encrypt_key(key + ctx->key_len / 2, /* second half of the key material */
+ ctx->key_len * 4, &xctx->ks2.ks); /* always an encryption schedule, whatever enc is */
+ xctx->xts.block2 = (block128_f) aesni_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ }
+
+ if (iv) {
+ xctx->xts.key2 = &xctx->ks2; /* key2 is published only when an IV is supplied */
+ memcpy(ctx->iv, iv, 16);
+ }
+
+ return 1; /* key-setup return values are not checked; always reports success */
+}
+
+# define aesni_xts_cipher aes_xts_cipher
+static int aesni_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, /* CCM init hook for AES-NI; key and iv may each be NULL */
+ const unsigned char *iv, int enc)
+{
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ if (!iv && !key) /* nothing to install */
+ return 1;
+ if (key) {
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks); /* return value is not checked */
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f) aesni_encrypt);
+ cctx->str = enc ? (ccm128_f) aesni_ccm64_encrypt_blocks : /* bulk routine depends on direction */
+ (ccm128_f) aesni_ccm64_decrypt_blocks;
+ cctx->key_set = 1;
+ }
+ if (iv) {
+ memcpy(ctx->iv, iv, 15 - cctx->L); /* copies 15 - L bytes of the supplied iv */
+ cctx->iv_set = 1;
+ }
+ return 1; /* always reports success */
+}
+
+# define aesni_ccm_cipher aes_ccm_cipher
+static int aesni_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) /* emits an AES-NI table, a generic table and the public accessor */ \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aesni_init_key, \
+ aesni_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize, \
+ keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_init_key, \
+ aes_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; } /* table is chosen on every call from the CPU capability bit */
+
+# define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) /* as BLOCK_CIPHER_generic, but with per-mode init, cleanup, ctrl and context type */ \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, /* XTS key length is doubled */ \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aesni_##mode##_init_key, \
+ aesni_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, /* XTS key length is doubled */ \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_##mode##_init_key, \
+ aes_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; } /* table is chosen on every call from the CPU capability bit */
+
+# elif defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
+
+# include "sparc_arch.h"
+
+extern unsigned int OPENSSL_sparcv9cap_P[];
+
+# define SPARC_AES_CAPABLE (OPENSSL_sparcv9cap_P[1] & CFR_AES)
+
+void aes_t4_set_encrypt_key(const unsigned char *key, int bits, AES_KEY *ks);
+void aes_t4_set_decrypt_key(const unsigned char *key, int bits, AES_KEY *ks);
+void aes_t4_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void aes_t4_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+/*
+ * Key-length-specific subroutines were chosen for the following reason.
+ * Each SPARC T4 core can execute up to 8 threads, which share the core's
+ * resources. Loading as much key material as possible into registers
+ * minimizes references to the shared memory interface, as well as the
+ * number of instructions in inner loops [much needed on T4]. Routines
+ * that are not key-length specific would instead need conditional
+ * branches, either in the inner loops or at subroutine entry. The former
+ * is hardly acceptable, while the latter grows the code to about the size
+ * of multiple key-length-specific subroutines anyway, so why fight?
+ */
+void aes128_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ unsigned char *ivec);
+void aes128_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ unsigned char *ivec);
+void aes192_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ unsigned char *ivec);
+void aes192_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ unsigned char *ivec);
+void aes256_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ unsigned char *ivec);
+void aes256_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ unsigned char *ivec);
+void aes128_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key,
+ unsigned char *ivec);
+void aes192_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key,
+ unsigned char *ivec);
+void aes256_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key,
+ unsigned char *ivec);
+void aes128_t4_xts_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char *ivec);
+void aes128_t4_xts_decrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char *ivec);
+void aes256_t4_xts_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char *ivec);
+void aes256_t4_xts_decrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char *ivec);
+/* Key setup for the SPARC T4 flavour of the generic AES modes. */
+static int aes_t4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                           const unsigned char *iv, int enc)
+{
+    /*
+     * "block" receives the single-block primitive and "stream" an optional
+     * bulk routine for CBC or CTR; other modes get stream.cbc = NULL.
+     */
+    EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+    const int mode = ctx->cipher->flags & EVP_CIPH_MODE;
+    const int bits = ctx->key_len * 8;
+    const int is_cbc = (mode == EVP_CIPH_CBC_MODE);
+    const int is_ctr = (mode == EVP_CIPH_CTR_MODE);
+    int supported = 1;
+
+    if (!enc && (is_cbc || mode == EVP_CIPH_ECB_MODE)) {
+        /* ECB and CBC decryption are the only users of the decrypt schedule */
+        aes_t4_set_decrypt_key(key, bits, ctx->cipher_data);
+        dat->block = (block128_f) aes_t4_decrypt;
+
+        if (bits == 128) {
+            dat->stream.cbc =
+                is_cbc ? (cbc128_f) aes128_t4_cbc_decrypt : NULL;
+        } else if (bits == 192) {
+            dat->stream.cbc =
+                is_cbc ? (cbc128_f) aes192_t4_cbc_decrypt : NULL;
+        } else if (bits == 256) {
+            dat->stream.cbc =
+                is_cbc ? (cbc128_f) aes256_t4_cbc_decrypt : NULL;
+        } else {
+            supported = 0;
+        }
+    } else {
+        /* every other mode/direction uses the encrypt key schedule */
+        aes_t4_set_encrypt_key(key, bits, ctx->cipher_data);
+        dat->block = (block128_f) aes_t4_encrypt;
+
+        if (bits == 128) {
+            if (is_cbc)
+                dat->stream.cbc = (cbc128_f) aes128_t4_cbc_encrypt;
+            else if (is_ctr)
+                dat->stream.ctr = (ctr128_f) aes128_t4_ctr32_encrypt;
+            else
+                dat->stream.cbc = NULL;
+        } else if (bits == 192) {
+            if (is_cbc)
+                dat->stream.cbc = (cbc128_f) aes192_t4_cbc_encrypt;
+            else if (is_ctr)
+                dat->stream.ctr = (ctr128_f) aes192_t4_ctr32_encrypt;
+            else
+                dat->stream.cbc = NULL;
+        } else if (bits == 256) {
+            if (is_cbc)
+                dat->stream.cbc = (cbc128_f) aes256_t4_cbc_encrypt;
+            else if (is_ctr)
+                dat->stream.ctr = (ctr128_f) aes256_t4_ctr32_encrypt;
+            else
+                dat->stream.cbc = NULL;
+        } else {
+            supported = 0;
+        }
+    }
+
+    if (!supported) {
+        /* key length was not 128, 192 or 256 bits */
+        EVPerr(EVP_F_AES_T4_INIT_KEY, EVP_R_AES_KEY_SETUP_FAILED);
+        return 0;
+    }
+
+    return 1;
+}
+/* The T4 tables reuse the generic mode handlers: each aes_t4_<mode>_cipher name is #defined to aes_<mode>_cipher and then forward-declared. */
+# define aes_t4_cbc_cipher aes_cbc_cipher
+static int aes_t4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aes_t4_ecb_cipher aes_ecb_cipher
+static int aes_t4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aes_t4_ofb_cipher aes_ofb_cipher
+static int aes_t4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aes_t4_cfb_cipher aes_cfb_cipher
+static int aes_t4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aes_t4_cfb8_cipher aes_cfb8_cipher
+static int aes_t4_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aes_t4_cfb1_cipher aes_cfb1_cipher
+static int aes_t4_cfb1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+# define aes_t4_ctr_cipher aes_ctr_cipher
+static int aes_t4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+/*
+ * GCM key/IV setup on top of the SPARC T4 routines. Either of key and iv
+ * may be NULL; a call with both NULL does nothing and reports success.
+ */
+static int aes_t4_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                               const unsigned char *iv, int enc)
+{
+    EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+    int bits;
+
+    if (key == NULL) {
+        if (iv == NULL)
+            return 1;
+        /* IV only: apply it now if a key is present, else save it */
+        if (gctx->key_set)
+            CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+        else
+            memcpy(gctx->iv, iv, gctx->ivlen);
+        gctx->iv_set = 1;
+        gctx->iv_gen = 0;
+        return 1;
+    }
+
+    bits = ctx->key_len * 8;
+    aes_t4_set_encrypt_key(key, bits, &gctx->ks.ks);
+    CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f) aes_t4_encrypt);
+    /* pick the counter routine matching the key length */
+    if (bits == 128)
+        gctx->ctr = (ctr128_f) aes128_t4_ctr32_encrypt;
+    else if (bits == 192)
+        gctx->ctr = (ctr128_f) aes192_t4_ctr32_encrypt;
+    else if (bits == 256)
+        gctx->ctr = (ctr128_f) aes256_t4_ctr32_encrypt;
+    else
+        return 0;
+
+    /* an explicit IV is used directly, otherwise a saved one if any */
+    if (iv == NULL && gctx->iv_set)
+        iv = gctx->iv;
+    if (iv != NULL) {
+        CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+        gctx->iv_set = 1;
+    }
+    gctx->key_set = 1;
+    return 1;
+}
+/* The T4 GCM table reuses the generic GCM data handler under an aliased name. */
+# define aes_t4_gcm_cipher aes_gcm_cipher
+static int aes_t4_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+/*
+ * XTS key/IV setup using the SPARC T4 routines.
+ *
+ * The key material carries two AES keys back to back. The first half is
+ * expanded into ks1 for the direction given by enc, the second half is
+ * always expanded for encryption into ks2. xts.key1 is pointed at ks1 once
+ * a key has been supplied and xts.key2 at ks2 once an IV has been supplied.
+ * Returns 1 on success and 0 for a key size without a T4 bulk routine.
+ * With both key and iv NULL the call is a successful no-op.
+ */
+static int aes_t4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                               const unsigned char *iv, int enc)
+{
+    EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+
+    if (key == NULL && iv == NULL)
+        return 1;
+
+    if (key != NULL) {
+        /* key_len is two AES keys, so one key is key_len * 8 / 2 bits */
+        const int bits = ctx->key_len * 4;
+
+        xctx->stream = NULL;
+        if (enc) {
+            aes_t4_set_encrypt_key(key, bits, &xctx->ks1.ks);
+            xctx->xts.block1 = (block128_f) aes_t4_encrypt;
+        } else {
+            aes_t4_set_decrypt_key(key, bits, &xctx->ks1.ks);
+            xctx->xts.block1 = (block128_f) aes_t4_decrypt;
+        }
+
+        /*
+         * Select the bulk routine. An unsupported size is only rejected
+         * here, i.e. after ks1 and block1 have already been written.
+         */
+        if (bits == 128)
+            xctx->stream = enc ? aes128_t4_xts_encrypt
+                               : aes128_t4_xts_decrypt;
+# if 0 /* not yet */
+        else if (bits == 192)
+            xctx->stream = enc ? aes192_t4_xts_encrypt
+                               : aes192_t4_xts_decrypt;
+# endif
+        else if (bits == 256)
+            xctx->stream = enc ? aes256_t4_xts_encrypt
+                               : aes256_t4_xts_decrypt;
+        else
+            return 0;
+
+        /* second key: starts key_len / 2 bytes into the key material */
+        aes_t4_set_encrypt_key(key + ctx->key_len / 2, bits, &xctx->ks2.ks);
+        xctx->xts.block2 = (block128_f) aes_t4_encrypt;
+
+        xctx->xts.key1 = &xctx->ks1;
+    }
+
+    if (iv != NULL) {
+        xctx->xts.key2 = &xctx->ks2;
+        /* the XTS IV is copied as a fixed 16 bytes */
+        memcpy(ctx->iv, iv, 16);
+    }
+
+    return 1;
+}
+/* The T4 XTS table reuses the generic XTS data handler under an aliased name. */
+# define aes_t4_xts_cipher aes_xts_cipher
+static int aes_t4_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+/* CCM key/IV setup using the SPARC T4 block routine; key and iv may each be NULL (both NULL is a successful no-op). */
+static int aes_t4_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                               const unsigned char *iv, int enc)
+{
+    EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+    if (!iv && !key)
+        return 1;
+    if (key) {
+        int bits = ctx->key_len * 8;
+        aes_t4_set_encrypt_key(key, bits, &cctx->ks.ks);
+        CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+                           &cctx->ks, (block128_f) aes_t4_encrypt);
+# if 0 /* not yet; decrypt names corrected from "ae128/ae192/ae256" */
+        switch (bits) {
+        case 128:
+            cctx->str = enc ? (ccm128_f) aes128_t4_ccm64_encrypt :
+                (ccm128_f) aes128_t4_ccm64_decrypt;
+            break;
+        case 192:
+            cctx->str = enc ? (ccm128_f) aes192_t4_ccm64_encrypt :
+                (ccm128_f) aes192_t4_ccm64_decrypt;
+            break;
+        case 256:
+            cctx->str = enc ? (ccm128_f) aes256_t4_ccm64_encrypt :
+                (ccm128_f) aes256_t4_ccm64_decrypt;
+            break;
+        default:
+            return 0;
+        }
+# else
+        cctx->str = NULL;       /* no T4 bulk CCM routine is wired up yet */
+# endif
+        cctx->key_set = 1;
+    }
+    if (iv) {
+        memcpy(ctx->iv, iv, 15 - cctx->L);      /* copies 15 - L bytes of IV */
+        cctx->iv_set = 1;
+    }
+    return 1;
+}
+/* The T4 CCM table reuses the generic CCM data handler under an aliased name. */
+# define aes_t4_ccm_cipher aes_ccm_cipher
+static int aes_t4_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+/* SPARC variant: one T4 table and one generic table per mode; the getter tests SPARC_AES_CAPABLE on every call. */
+# define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aes_t4_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_t4_init_key, \
+ aes_t4_##mode##_cipher, /* a #define alias of aes_<mode>_cipher, see above */ \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize, \
+ keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_init_key, \
+ aes_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return SPARC_AES_CAPABLE?&aes_t4_##keylen##_##mode:&aes_##keylen##_##mode; }
+/* SPARC variant for custom modes: T4 and generic tables differ in init_key/cipher but share cleanup and ctrl. */
+# define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aes_t4_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, /* key length is doubled for XTS */ \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_t4_##mode##_init_key, \
+ aes_t4_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_##mode##_init_key, \
+ aes_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return SPARC_AES_CAPABLE?&aes_t4_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+# else
+/* Fallback branch of the platform conditional: a single table per mode and a getter that returns it unconditionally. */
+# define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_init_key, \
+ aes_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+/* Fallback variant for custom modes: one table built from the aes_<mode>_* handlers, returned unconditionally. */
+# define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, /* key length is doubled for XTS */ \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_##mode##_init_key, \
+ aes_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+# endif /* closes the platform conditional that selects the BLOCK_CIPHER_* variants */
+/* ARM with a CPUID object: define the BSAES/HWAES capability tests and map the HWAES_* names onto the aes_v8_* routines. */
+# if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__))
+# include "arm_arch.h"
+# if __ARM_MAX_ARCH__>=7
+# if defined(BSAES_ASM)
+# define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON) /* tests the ARMV7_NEON capability bit */
+# endif
+# define HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) /* tests the ARMV8_AES capability bit */
+# define HWAES_set_encrypt_key aes_v8_set_encrypt_key
+# define HWAES_set_decrypt_key aes_v8_set_decrypt_key
+# define HWAES_encrypt aes_v8_encrypt
+# define HWAES_decrypt aes_v8_decrypt
+# define HWAES_cbc_encrypt aes_v8_cbc_encrypt
+# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
+# endif
+# endif
+/* Prototypes for the hardware AES routines, only when HWAES_CAPABLE is defined; on ARM the HWAES_* names are macros (see above). */
+# if defined(HWAES_CAPABLE)
+int HWAES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+int HWAES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+void HWAES_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void HWAES_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ unsigned char *ivec, const int enc);
+void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ const unsigned char ivec[16]);
+# endif
+/* Expands BLOCK_CIPHER_generic once per generic mode (cbc, ecb, ofb, cfb, cfb1, cfb8, ctr) for a single key length. */
+# define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) /* ECB: IV length 0 */ \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) /* block size 1 from here on */ \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb1,cfb1,CFB,flags) /* cfb1, cfb8 and ctr omit EVP_CIPH_FLAG_DEFAULT_ASN1 */ \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb8,cfb8,CFB,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,ctr,ctr,CTR,flags)
+/* Key setup for the generic modes: tries HWAES, BSAES, VPAES (each only if compiled in and capable), then the plain AES_* routines. Returns 1, or 0 if key setup reported ret < 0. */
+static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+{
+ int ret, mode;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+ /* The whole body below is one if/else chain stitched together across the #ifdef sections. */
+ mode = ctx->cipher->flags & EVP_CIPH_MODE;
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc) /* ECB/CBC decryption: the only case that needs a decrypt key schedule */
+# ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE) {
+ ret = HWAES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+ dat->block = (block128_f) HWAES_decrypt;
+ dat->stream.cbc = NULL;
+# ifdef HWAES_cbc_encrypt
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) HWAES_cbc_encrypt;
+# endif
+ } else
+# endif
+# ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE && mode == EVP_CIPH_CBC_MODE) { /* BSAES is used for CBC decryption only */
+ ret = AES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+ dat->block = (block128_f) AES_decrypt;
+ dat->stream.cbc = (cbc128_f) bsaes_cbc_encrypt;
+ } else
+# endif
+# ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE) {
+ ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+ dat->block = (block128_f) vpaes_decrypt;
+ dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+ (cbc128_f) vpaes_cbc_encrypt : NULL;
+ } else
+# endif
+ { /* plain AES_* decrypt fallback */
+ ret = AES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+ dat->block = (block128_f) AES_decrypt;
+ dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+ (cbc128_f) AES_cbc_encrypt : NULL;
+ } else /* every other mode/direction: set up an encrypt key schedule */
+# ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE) {
+ ret = HWAES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+ dat->block = (block128_f) HWAES_encrypt;
+ dat->stream.cbc = NULL;
+# ifdef HWAES_cbc_encrypt
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) HWAES_cbc_encrypt;
+ else
+# endif
+# ifdef HWAES_ctr32_encrypt_blocks
+ if (mode == EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f) HWAES_ctr32_encrypt_blocks;
+ else
+# endif
+ (void)0; /* terminate potentially open 'else' */
+ } else
+# endif
+# ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE && mode == EVP_CIPH_CTR_MODE) { /* BSAES is used for CTR on the encrypt side */
+ ret = AES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+ dat->block = (block128_f) AES_encrypt;
+ dat->stream.ctr = (ctr128_f) bsaes_ctr32_encrypt_blocks;
+ } else
+# endif
+# ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE) {
+ ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+ dat->block = (block128_f) vpaes_encrypt;
+ dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+ (cbc128_f) vpaes_cbc_encrypt : NULL;
+ } else
+# endif
+ { /* plain AES_* encrypt fallback */
+ ret = AES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+ dat->block = (block128_f) AES_encrypt;
+ dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+ (cbc128_f) AES_cbc_encrypt : NULL;
+# ifdef AES_CTR_ASM
+ if (mode == EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f) AES_ctr32_encrypt;
+# endif
+ }
+ /* ret holds the result of whichever *_set_*_key call ran above */
+ if (ret < 0) {
+ EVPerr(EVP_F_AES_INIT_KEY, EVP_R_AES_KEY_SETUP_FAILED);
+ return 0;
+ }
+
+ return 1;
+}
+/* CBC handler: prefers the bulk routine installed at key setup, otherwise the CRYPTO_cbc128_* helpers. */
+static int aes_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    EVP_AES_KEY *aes = (EVP_AES_KEY *) ctx->cipher_data;
+    const size_t block_len = ctx->cipher->block_size;
+
+    /* input shorter than one block is accepted and left unprocessed */
+    if (len >= block_len) {
+        if (aes->stream.cbc != NULL) {
+            aes->stream.cbc(in, out, len, &aes->ks, ctx->iv, ctx->encrypt);
+        } else if (ctx->encrypt) {
+            CRYPTO_cbc128_encrypt(in, out, len, &aes->ks, ctx->iv, aes->block);
+        } else {
+            CRYPTO_cbc128_decrypt(in, out, len, &aes->ks, ctx->iv, aes->block);
+        }
+    }
+    return 1;
+}
+/* ECB handler: applies the single-block primitive to each whole block; trailing partial data is ignored. */
+static int aes_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    EVP_AES_KEY *aes = (EVP_AES_KEY *) ctx->cipher_data;
+    const size_t block_len = ctx->cipher->block_size;
+    if (len >= block_len) {
+        const size_t last = len - block_len; /* highest offset a block may start at */
+        size_t off = 0;
+        do {
+            aes->block(in + off, out + off, &aes->ks);
+            off += block_len;
+        } while (off <= last);
+    }
+    return 1;
+}
+/* OFB handler: hands the whole buffer to CRYPTO_ofb128_encrypt together with the context's IV and num. */
+static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    EVP_AES_KEY *aes = (EVP_AES_KEY *) ctx->cipher_data;
+
+    CRYPTO_ofb128_encrypt(in, out, len,
+                          &aes->ks, ctx->iv, &ctx->num, aes->block);
+    return 1;
+}
+/* CFB128 handler: hands the whole buffer to CRYPTO_cfb128_encrypt; direction comes from ctx->encrypt. */
+static int aes_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    EVP_AES_KEY *aes = (EVP_AES_KEY *) ctx->cipher_data;
+
+    CRYPTO_cfb128_encrypt(in, out, len, &aes->ks, ctx->iv,
+                          &ctx->num, ctx->encrypt, aes->block);
+    return 1;
+}
+/* CFB8 handler: hands the whole buffer to CRYPTO_cfb128_8_encrypt; direction comes from ctx->encrypt. */
+static int aes_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                           const unsigned char *in, size_t len)
+{
+    EVP_AES_KEY *aes = (EVP_AES_KEY *) ctx->cipher_data;
+
+    CRYPTO_cfb128_8_encrypt(in, out, len, &aes->ks, ctx->iv,
+                            &ctx->num, ctx->encrypt, aes->block);
+    return 1;
+}
+/* CFB1 handler. len counts bits when EVP_CIPH_FLAG_LENGTH_BITS is set on the context and bytes otherwise. Always returns 1. */
+static int aes_cfb1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                           const unsigned char *in, size_t len)
+{
+    EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+    if (ctx->flags & EVP_CIPH_FLAG_LENGTH_BITS) {
+        CRYPTO_cfb128_1_encrypt(in, out, len, &dat->ks,
+                                ctx->iv, &ctx->num, ctx->encrypt, dat->block);
+        return 1;
+    }
+    /* Byte count: work in MAXBITCHUNK-byte pieces (presumably so that bytes * 8 cannot overflow; confirm at its definition). */
+    while (len >= MAXBITCHUNK) {
+        CRYPTO_cfb128_1_encrypt(in, out, MAXBITCHUNK * 8, &dat->ks,
+                                ctx->iv, &ctx->num, ctx->encrypt, dat->block);
+        in += MAXBITCHUNK;      /* step past the chunk just processed ... */
+        out += MAXBITCHUNK;     /* ... otherwise each pass redoes the first one */
+        len -= MAXBITCHUNK;
+    }
+    if (len)
+        CRYPTO_cfb128_1_encrypt(in, out, len * 8, &dat->ks,
+                                ctx->iv, &ctx->num, ctx->encrypt, dat->block);
+    return 1;
+}
+/* CTR handler: uses the ctr32 bulk routine when one was installed at key setup, otherwise the per-block path. */
+static int aes_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    EVP_AES_KEY *aes = (EVP_AES_KEY *) ctx->cipher_data;
+    unsigned int used = ctx->num;       /* local copy, written back below */
+    if (aes->stream.ctr == NULL) {
+        CRYPTO_ctr128_encrypt(in, out, len, &aes->ks,
+                              ctx->iv, ctx->buf, &used, aes->block);
+    } else {
+        CRYPTO_ctr128_encrypt_ctr32(in, out, len, &aes->ks,
+                                    ctx->iv, ctx->buf, &used, aes->stream.ctr);
+    }
+    ctx->num = (size_t)used;
+    return 1;
+}
+/* Instantiate the generic-mode tables and getters for 128-, 192- and 256-bit keys, all carrying EVP_CIPH_FLAG_FIPS. */
+BLOCK_CIPHER_generic_pack(NID_aes, 128, EVP_CIPH_FLAG_FIPS)
+ BLOCK_CIPHER_generic_pack(NID_aes, 192, EVP_CIPH_FLAG_FIPS)
+ BLOCK_CIPHER_generic_pack(NID_aes, 256, EVP_CIPH_FLAG_FIPS)
+/* GCM cleanup: frees an IV buffer that is not the context's own c->iv and wipes the GCM state. Always returns 1. */
+static int aes_gcm_cleanup(EVP_CIPHER_CTX *c)
+{
+    EVP_AES_GCM_CTX *state = c->cipher_data;
+    if (state->iv != c->iv)     /* a separately allocated IV buffer */
+        OPENSSL_free(state->iv);
+    OPENSSL_cleanse(&state->gcm, sizeof(state->gcm));
+    return 1;
+}
+/*
+ * Add one to the 8-byte big-endian counter at "counter": the last byte is
+ * bumped first and a carry moves towards the first byte. A counter of all
+ * 0xff bytes wraps around to all zero.
+ */
+static void ctr64_inc(unsigned char *counter)
+{
+    int idx = 8;
+
+    while (idx-- > 0) {
+        counter[idx]++;
+        /* a byte that did not wrap to zero has absorbed the carry */
+        if (counter[idx] != 0)
+            return;
+    }
+}
+/* AES-GCM ctrl hook. Returns 1 on success, 0 when a request is refused and
+ * -1 for an unknown type; EVP_CTRL_AEAD_TLS1_AAD returns the tag length. */
+static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+{
+    EVP_AES_GCM_CTX *gctx = c->cipher_data;
+    switch (type) {
+    case EVP_CTRL_INIT:
+        gctx->key_set = 0;
+        gctx->iv_set = 0;
+        gctx->ivlen = c->cipher->iv_len;
+        gctx->iv = c->iv;               /* start with the context's own IV buffer */
+        gctx->taglen = -1;
+        gctx->iv_gen = 0;
+        gctx->tls_aad_len = -1;         /* negative: not in TLS record mode */
+        return 1;
+
+    case EVP_CTRL_GCM_SET_IVLEN:
+        if (arg <= 0)
+            return 0;
+        /* Allocate memory for IV if needed */
+        if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen)) {
+            if (gctx->iv != c->iv)
+                OPENSSL_free(gctx->iv);
+            gctx->iv = OPENSSL_malloc(arg);
+            if (!gctx->iv)
+                return 0;
+        }
+        gctx->ivlen = arg;
+        return 1;
+
+    case EVP_CTRL_GCM_SET_TAG:          /* decrypt only: 1..16 tag bytes kept in c->buf */
+        if (arg <= 0 || arg > 16 || c->encrypt)
+            return 0;
+        memcpy(c->buf, ptr, arg);
+        gctx->taglen = arg;
+        return 1;
+
+    case EVP_CTRL_GCM_GET_TAG:          /* encrypt only, and only once taglen is set */
+        if (arg <= 0 || arg > 16 || !c->encrypt || gctx->taglen < 0)
+            return 0;
+        memcpy(ptr, c->buf, arg);
+        return 1;
+
+    case EVP_CTRL_GCM_SET_IV_FIXED:
+        /* Special case: -1 length restores whole IV */
+        if (arg == -1) {
+            memcpy(gctx->iv, ptr, gctx->ivlen);
+            gctx->iv_gen = 1;
+            return 1;
+        }
+        /* Fixed field must be at least 4 bytes, invocation field at least 8 */
+        if ((arg < 4) || (gctx->ivlen - arg) < 8)
+            return 0;
+        if (arg)
+            memcpy(gctx->iv, ptr, arg);
+        if (c->encrypt && RAND_bytes(gctx->iv + arg, gctx->ivlen - arg) <= 0)
+            return 0;
+        gctx->iv_gen = 1;
+        return 1;
+
+    case EVP_CTRL_GCM_IV_GEN:
+        if (gctx->iv_gen == 0 || gctx->key_set == 0)
+            return 0;
+        CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+        if (arg <= 0 || arg > gctx->ivlen)
+            arg = gctx->ivlen;
+        memcpy(ptr, gctx->iv + gctx->ivlen - arg, arg);
+        /* Invocation field is at least 8 bytes, so there is no need to check
+         * wrap around or to increment more than the last 8 bytes. */
+        ctr64_inc(gctx->iv + gctx->ivlen - 8);
+        gctx->iv_set = 1;
+        return 1;
+
+    case EVP_CTRL_GCM_SET_IV_INV:
+        if (gctx->iv_gen == 0 || gctx->key_set == 0 || c->encrypt)
+            return 0;
+        memcpy(gctx->iv + gctx->ivlen - arg, ptr, arg);
+        CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+        gctx->iv_set = 1;
+        return 1;
+
+    case EVP_CTRL_AEAD_TLS1_AAD:
+        /* Save the AAD for later use */
+        if (arg != 13)
+            return 0;
+        memcpy(c->buf, ptr, arg);
+        {
+            unsigned int len = c->buf[arg - 2] << 8 | c->buf[arg - 1];
+            /* Correct length for explicit IV */
+            if (len < EVP_GCM_TLS_EXPLICIT_IV_LEN)
+                return 0;       /* too short: the subtraction would wrap */
+            len -= EVP_GCM_TLS_EXPLICIT_IV_LEN;
+            /* If decrypting correct for tag too */
+            if (!c->encrypt) {
+                if (len < EVP_GCM_TLS_TAG_LEN)
+                    return 0;
+                len -= EVP_GCM_TLS_TAG_LEN;
+            }
+            c->buf[arg - 2] = len >> 8;
+            c->buf[arg - 1] = len & 0xff;
+        }
+        /* Enter TLS record mode only once the length has been validated */
+        gctx->tls_aad_len = arg;
+        /* Extra padding: tag appended to record */
+        return EVP_GCM_TLS_TAG_LEN;
+
+    case EVP_CTRL_COPY:
+        {
+            EVP_CIPHER_CTX *out = ptr;
+            EVP_AES_GCM_CTX *gctx_out = out->cipher_data;
+            if (gctx->gcm.key) {
+                if (gctx->gcm.key != &gctx->ks)
+                    return 0;
+                gctx_out->gcm.key = &gctx_out->ks;      /* point at the copy's own schedule */
+            }
+            if (gctx->iv == c->iv)
+                gctx_out->iv = out->iv;
+            else {
+                gctx_out->iv = OPENSSL_malloc(gctx->ivlen);
+                if (!gctx_out->iv)
+                    return 0;
+                memcpy(gctx_out->iv, gctx->iv, gctx->ivlen);
+            }
+            return 1;
+        }
+    default:
+        return -1;
+    }
+}
+/* GCM key/IV setup: tries HWAES, BSAES, VPAES (each only if compiled in and capable), then plain AES_*. Either argument may be NULL; always returns 1. */
+static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+{
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key) {
+ do { /* single-pass loop: each accelerated branch leaves via "break" */
+# ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE) {
+ HWAES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+ (block128_f) HWAES_encrypt);
+# ifdef HWAES_ctr32_encrypt_blocks
+ gctx->ctr = (ctr128_f) HWAES_ctr32_encrypt_blocks;
+# else
+ gctx->ctr = NULL;
+# endif
+ break;
+ } else
+# endif
+# ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE) {
+ AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+ (block128_f) AES_encrypt);
+ gctx->ctr = (ctr128_f) bsaes_ctr32_encrypt_blocks;
+ break;
+ } else
+# endif
+# ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE) {
+ vpaes_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+ (block128_f) vpaes_encrypt);
+ gctx->ctr = NULL;
+ break;
+ } else
+# endif
+ (void)0; /* terminate potentially open 'else' */
+ /* reached only when no accelerated branch above was taken */
+ AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+ (block128_f) AES_encrypt);
+# ifdef AES_CTR_ASM
+ gctx->ctr = (ctr128_f) AES_ctr32_encrypt;
+# else
+ gctx->ctr = NULL;
+# endif
+ } while (0);
+
+ /*
+ * If we have an iv can set it directly, otherwise use saved IV.
+ */
+ if (iv == NULL && gctx->iv_set)
+ iv = gctx->iv;
+ if (iv) {
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ }
+ gctx->key_set = 1;
+ } else {
+ /* If key set use IV, otherwise copy */
+ if (gctx->key_set)
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ else
+ memcpy(gctx->iv, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ gctx->iv_gen = 0; /* an IV supplied without a key clears iv_gen */
+ }
+ return 1;
+}
+/*
+ * Handle TLS GCM packet format. The buffer holds the explicit part of the IV,
+ * then the payload and finally the tag, and is processed in place.
+ * Encrypt: generate the IV, encrypt the payload and write the tag.
+ * Decrypt: take the IV from the record, decrypt the payload and verify the
+ * tag; on a mismatch the decrypted payload is wiped.
+ * Returns the number of bytes produced, or -1 on any failure.
+ */
+static int aes_gcm_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                              const unsigned char *in, size_t len)
+{
+    EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+    int rv = -1;
+    /* Encrypt/decrypt must be performed in place */
+    if (out != in
+        || len < (EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN))
+        return -1;
+    /*
+     * Set IV from start of buffer or generate IV and write to start of
+     * buffer.
+     */
+    if (EVP_CIPHER_CTX_ctrl(ctx, ctx->encrypt ?
+                            EVP_CTRL_GCM_IV_GEN : EVP_CTRL_GCM_SET_IV_INV,
+                            EVP_GCM_TLS_EXPLICIT_IV_LEN, out) <= 0)
+        goto err;
+    /* Use saved AAD */
+    if (CRYPTO_gcm128_aad(&gctx->gcm, ctx->buf, gctx->tls_aad_len))
+        goto err;
+    /* Fix buffer and length to point to payload */
+    in += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+    out += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+    len -= EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+    if (ctx->encrypt) {
+        /* Encrypt payload */
+        if (gctx->ctr) {
+            size_t bulk = 0;
+# if defined(AES_GCM_ASM)
+            if (len >= 32 && AES_GCM_ASM(gctx)) {
+                if (CRYPTO_gcm128_encrypt(&gctx->gcm, NULL, NULL, 0))
+                    return -1;
+
+                bulk = AES_gcm_encrypt(in, out, len,
+                                       gctx->gcm.key,
+                                       gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+                gctx->gcm.len.u[1] += bulk;
+            }
+# endif
+            if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+                                            in + bulk,
+                                            out + bulk,
+                                            len - bulk, gctx->ctr))
+                goto err;
+        } else {
+            size_t bulk = 0;
+# if defined(AES_GCM_ASM2)
+            if (len >= 32 && AES_GCM_ASM2(gctx)) {
+                if (CRYPTO_gcm128_encrypt(&gctx->gcm, NULL, NULL, 0))
+                    return -1;
+
+                bulk = AES_gcm_encrypt(in, out, len,
+                                       gctx->gcm.key,
+                                       gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+                gctx->gcm.len.u[1] += bulk;
+            }
+# endif
+            if (CRYPTO_gcm128_encrypt(&gctx->gcm,
+                                      in + bulk, out + bulk, len - bulk))
+                goto err;
+        }
+        out += len;
+        /* Finally write tag */
+        CRYPTO_gcm128_tag(&gctx->gcm, out, EVP_GCM_TLS_TAG_LEN);
+        rv = len + EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+    } else {
+        /* Decrypt */
+        if (gctx->ctr) {
+            size_t bulk = 0;
+# if defined(AES_GCM_ASM)
+            if (len >= 16 && AES_GCM_ASM(gctx)) {
+                if (CRYPTO_gcm128_decrypt(&gctx->gcm, NULL, NULL, 0))
+                    return -1;
+
+                bulk = AES_gcm_decrypt(in, out, len,
+                                       gctx->gcm.key,
+                                       gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+                gctx->gcm.len.u[1] += bulk;
+            }
+# endif
+            if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+                                            in + bulk,
+                                            out + bulk,
+                                            len - bulk, gctx->ctr))
+                goto err;
+        } else {
+            size_t bulk = 0;
+# if defined(AES_GCM_ASM2)
+            if (len >= 16 && AES_GCM_ASM2(gctx)) {
+                if (CRYPTO_gcm128_decrypt(&gctx->gcm, NULL, NULL, 0))
+                    return -1;
+
+                bulk = AES_gcm_decrypt(in, out, len,
+                                       gctx->gcm.key,
+                                       gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+                gctx->gcm.len.u[1] += bulk;
+            }
+# endif
+            if (CRYPTO_gcm128_decrypt(&gctx->gcm,
+                                      in + bulk, out + bulk, len - bulk))
+                goto err;
+        }
+        /* Retrieve tag */
+        CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, EVP_GCM_TLS_TAG_LEN);
+        /* If tag mismatch wipe buffer; compare in constant time */
+        if (CRYPTO_memcmp(ctx->buf, in + len, EVP_GCM_TLS_TAG_LEN)) {
+            OPENSSL_cleanse(out, len);
+            goto err;
+        }
+        rv = len;
+    }
+
+ err:
+    gctx->iv_set = 0;           /* the IV is not reused for another record */
+    gctx->tls_aad_len = -1;     /* fresh AAD is required for the next record */
+    return rv;
+}
+/* GCM data handler. in && !out feeds AAD, in && out en/decrypts, in == NULL finalizes the tag. Returns len for data, 0 after finalizing, -1 on error. */
+static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ /* If not set up, return error */
+ if (!gctx->key_set)
+ return -1;
+
+ if (gctx->tls_aad_len >= 0) /* TLS AAD was supplied: take the TLS record path */
+ return aes_gcm_tls_cipher(ctx, out, in, len);
+
+ if (!gctx->iv_set)
+ return -1;
+ if (in) {
+ if (out == NULL) { /* no output buffer: the input is additional authenticated data */
+ if (CRYPTO_gcm128_aad(&gctx->gcm, in, len))
+ return -1;
+ } else if (ctx->encrypt) {
+ if (gctx->ctr) {
+ size_t bulk = 0;
+# if defined(AES_GCM_ASM)
+ if (len >= 32 && AES_GCM_ASM(gctx)) {
+ size_t res = (16 - gctx->gcm.mres) % 16; /* presumably the bytes needed to finish a partial 16-byte block; confirm */
+
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res))
+ return -1;
+
+ bulk = AES_gcm_encrypt(in + res,
+ out + res, len - res,
+ gctx->gcm.key, gctx->gcm.Yi.c,
+ gctx->gcm.Xi.u);
+ gctx->gcm.len.u[1] += bulk;
+ bulk += res;
+ }
+# endif
+ if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm, /* handles whatever the bulk step left over */
+ in + bulk,
+ out + bulk,
+ len - bulk, gctx->ctr))
+ return -1;
+ } else {
+ size_t bulk = 0;
+# if defined(AES_GCM_ASM2)
+ if (len >= 32 && AES_GCM_ASM2(gctx)) {
+ size_t res = (16 - gctx->gcm.mres) % 16;
+
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res))
+ return -1;
+
+ bulk = AES_gcm_encrypt(in + res,
+ out + res, len - res,
+ gctx->gcm.key, gctx->gcm.Yi.c,
+ gctx->gcm.Xi.u);
+ gctx->gcm.len.u[1] += bulk;
+ bulk += res;
+ }
+# endif
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm,
+ in + bulk, out + bulk, len - bulk))
+ return -1;
+ }
+ } else { /* decrypt: mirrors the encrypt branch, with a 16-byte bulk threshold */
+ if (gctx->ctr) {
+ size_t bulk = 0;
+# if defined(AES_GCM_ASM)
+ if (len >= 16 && AES_GCM_ASM(gctx)) {
+ size_t res = (16 - gctx->gcm.mres) % 16;
+
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res))
+ return -1;
+
+ bulk = AES_gcm_decrypt(in + res,
+ out + res, len - res,
+ gctx->gcm.key,
+ gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+ gctx->gcm.len.u[1] += bulk;
+ bulk += res;
+ }
+# endif
+ if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+ in + bulk,
+ out + bulk,
+ len - bulk, gctx->ctr))
+ return -1;
+ } else {
+ size_t bulk = 0;
+# if defined(AES_GCM_ASM2)
+ if (len >= 16 && AES_GCM_ASM2(gctx)) {
+ size_t res = (16 - gctx->gcm.mres) % 16;
+
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res))
+ return -1;
+
+ bulk = AES_gcm_decrypt(in + res,
+ out + res, len - res,
+ gctx->gcm.key,
+ gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+ gctx->gcm.len.u[1] += bulk;
+ bulk += res;
+ }
+# endif
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm,
+ in + bulk, out + bulk, len - bulk))
+ return -1;
+ }
+ }
+ return len;
+ } else { /* in == NULL: finalization */
+ if (!ctx->encrypt) {
+ if (gctx->taglen < 0) /* no expected tag was set for decryption */
+ return -1;
+ if (CRYPTO_gcm128_finish(&gctx->gcm, ctx->buf, gctx->taglen) != 0)
+ return -1;
+ gctx->iv_set = 0;
+ return 0;
+ }
+ CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, 16); /* encrypt: 16-byte tag is left in ctx->buf */
+ gctx->taglen = 16;
+ /* Don't reuse the IV */
+ gctx->iv_set = 0;
+ return 0;
+ }
+
+}
+/* Flag set OR-ed into the flags argument of the BLOCK_CIPHER_custom instantiations for GCM below. */
+# define CUSTOM_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 \
+ | EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
+ | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \
+ | EVP_CIPH_CUSTOM_COPY)
+/* Instantiate the GCM tables and getters for 128-, 192- and 256-bit keys: block size 1, IV length 12. */
+BLOCK_CIPHER_custom(NID_aes, 128, 1, 12, gcm, GCM,
+ EVP_CIPH_FLAG_FIPS | EVP_CIPH_FLAG_AEAD_CIPHER |
+ CUSTOM_FLAGS)
+ BLOCK_CIPHER_custom(NID_aes, 192, 1, 12, gcm, GCM,
+ EVP_CIPH_FLAG_FIPS | EVP_CIPH_FLAG_AEAD_CIPHER |
+ CUSTOM_FLAGS)
+ BLOCK_CIPHER_custom(NID_aes, 256, 1, 12, gcm, GCM,
+ EVP_CIPH_FLAG_FIPS | EVP_CIPH_FLAG_AEAD_CIPHER |
+ CUSTOM_FLAGS)
+/* XTS ctrl hook: EVP_CTRL_INIT clears the key pointers, EVP_CTRL_COPY re-points them in the copy, any other type gives -1. */
+static int aes_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+{
+    EVP_AES_XTS_CTX *src = c->cipher_data, *dst;
+    switch (type) {
+    case EVP_CTRL_INIT:
+        /* key1 and key2 are used as an indicator both key and IV are set */
+        src->xts.key1 = NULL;
+        src->xts.key2 = NULL;
+        return 1;
+    case EVP_CTRL_COPY:
+        dst = ((EVP_CIPHER_CTX *) ptr)->cipher_data;
+        if (src->xts.key1 != NULL && src->xts.key1 != &src->ks1)
+            return 0;           /* pointer does not refer to our own ks1 */
+        if (src->xts.key1 != NULL)
+            dst->xts.key1 = &dst->ks1;
+        if (src->xts.key2 != NULL && src->xts.key2 != &src->ks2)
+            return 0;
+        if (src->xts.key2 != NULL)
+            dst->xts.key2 = &dst->ks2;
+        return 1;
+    default:
+        return -1;
+    }
+}
+/* XTS key/IV setup: the key material is two AES keys of key_len * 4 bits each; tries HWAES, (BSAES stream), VPAES, then plain AES_*. Always returns 1. */
+static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+{
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+
+ if (key)
+ do { /* single-pass loop: the HWAES and VPAES branches leave via "break" */
+# ifdef AES_XTS_ASM
+ xctx->stream = enc ? AES_xts_encrypt : AES_xts_decrypt;
+# else
+ xctx->stream = NULL;
+# endif
+ /* key_len is two AES keys */
+# ifdef HWAES_CAPABLE
+ if (HWAES_CAPABLE) {
+ if (enc) {
+ HWAES_set_encrypt_key(key, ctx->key_len * 4,
+ &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) HWAES_encrypt;
+ } else {
+ HWAES_set_decrypt_key(key, ctx->key_len * 4,
+ &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) HWAES_decrypt;
+ }
+
+ HWAES_set_encrypt_key(key + ctx->key_len / 2,
+ ctx->key_len * 4, &xctx->ks2.ks);
+ xctx->xts.block2 = (block128_f) HWAES_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ break;
+ } else
+# endif
+# ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE) /* only replaces the stream routine, then continues to the plain AES_* key setup below */
+ xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
+ else
+# endif
+# ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE) {
+ if (enc) {
+ vpaes_set_encrypt_key(key, ctx->key_len * 4,
+ &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) vpaes_encrypt;
+ } else {
+ vpaes_set_decrypt_key(key, ctx->key_len * 4,
+ &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) vpaes_decrypt;
+ }
+
+ vpaes_set_encrypt_key(key + ctx->key_len / 2,
+ ctx->key_len * 4, &xctx->ks2.ks);
+ xctx->xts.block2 = (block128_f) vpaes_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ break;
+ } else
+# endif
+ (void)0; /* terminate potentially open 'else' */
+ /* plain AES_* key setup: first key follows enc, second key is always an encrypt schedule */
+ if (enc) {
+ AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) AES_encrypt;
+ } else {
+ AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f) AES_decrypt;
+ }
+
+ AES_set_encrypt_key(key + ctx->key_len / 2,
+ ctx->key_len * 4, &xctx->ks2.ks);
+ xctx->xts.block2 = (block128_f) AES_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ } while (0);
+
+ if (iv) {
+ xctx->xts.key2 = &xctx->ks2; /* key2 is only pointed at ks2 once an IV arrives */
+ memcpy(ctx->iv, iv, 16);
+ }
+
+ return 1;
+}
+/* XTS do_cipher: needs key and IV set plus >= one AES block; 1 ok, 0 fail. */
+static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+    if (!xctx->xts.key1 || !xctx->xts.key2)
+        return 0;
+    if (!out || !in || len < AES_BLOCK_SIZE)
+        return 0;
+    if (xctx->stream) /* dedicated XTS routine; its result is not checked */
+        (*xctx->stream) (in, out, len,
+                         xctx->xts.key1, xctx->xts.key2, ctx->iv);
+    else if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len,
+                                   ctx->encrypt))
+        return 0;
+    return 1;
+}
+
+# define aes_xts_cleanup NULL
+
+# define XTS_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \
+ | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \
+ | EVP_CIPH_CUSTOM_COPY)
+
+BLOCK_CIPHER_custom(NID_aes, 128, 1, 16, xts, XTS,
+ EVP_CIPH_FLAG_FIPS | XTS_FLAGS)
+ BLOCK_CIPHER_custom(NID_aes, 256, 1, 16, xts, XTS,
+ EVP_CIPH_FLAG_FIPS | XTS_FLAGS)
+/* CCM ctrl: reset defaults, set L / nonce length, set or get tag, fix copy. */
+static int aes_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+{
+    EVP_AES_CCM_CTX *cctx = c->cipher_data;
+    switch (type) {
+    case EVP_CTRL_INIT:
+        cctx->key_set = 0;
+        cctx->iv_set = 0;
+        cctx->L = 8; /* default; SET_IVLEN below maps a length to 15 - L */
+        cctx->M = 12; /* default tag length */
+        cctx->tag_set = 0;
+        cctx->len_set = 0;
+        return 1;
+
+    case EVP_CTRL_CCM_SET_IVLEN:
+        arg = 15 - arg; /* convert to L, then fall through to validate it */
+    case EVP_CTRL_CCM_SET_L:
+        if (arg < 2 || arg > 8)
+            return 0;
+        cctx->L = arg;
+        return 1;
+
+    case EVP_CTRL_CCM_SET_TAG:
+        if ((arg & 1) || arg < 4 || arg > 16) /* even lengths 4..16 only */
+            return 0;
+        if (c->encrypt && ptr) /* a tag value is only accepted for decrypt */
+            return 0;
+        if (ptr) {
+            cctx->tag_set = 1;
+            memcpy(c->buf, ptr, arg); /* expected tag is kept in c->buf */
+        }
+        cctx->M = arg;
+        return 1;
+
+    case EVP_CTRL_CCM_GET_TAG:
+        if (!c->encrypt || !cctx->tag_set)
+            return 0;
+        if (!CRYPTO_ccm128_tag(&cctx->ccm, ptr, (size_t)arg))
+            return 0;
+        cctx->tag_set = 0; /* state is cleared once the tag has been read */
+        cctx->iv_set = 0;
+        cctx->len_set = 0;
+        return 1;
+
+    case EVP_CTRL_COPY:
+        {
+            EVP_CIPHER_CTX *out = ptr;
+            EVP_AES_CCM_CTX *cctx_out = out->cipher_data;
+            if (cctx->ccm.key) {
+                if (cctx->ccm.key != &cctx->ks) /* not this ctx's own ks */
+                    return 0;
+                cctx_out->ccm.key = &cctx_out->ks; /* rebind to copy's ks */
+            }
+            return 1;
+        }
+
+    default:
+        return -1;
+
+    }
+}
+/* CCM key/IV setup: expands the key, binds it to the CCM state, saves iv. */
+static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                            const unsigned char *iv, int enc)
+{
+    EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+    if (!iv && !key)
+        return 1;
+    if (key) /* enc is not consulted: every path expands an encrypt key */
+        do {
+# ifdef HWAES_CAPABLE
+            if (HWAES_CAPABLE) {
+                HWAES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
+
+                CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+                                   &cctx->ks, (block128_f) HWAES_encrypt);
+                cctx->str = NULL;
+                cctx->key_set = 1;
+                break; /* done: skip the remaining implementations */
+            } else
+# endif
+# ifdef VPAES_CAPABLE
+            if (VPAES_CAPABLE) {
+                vpaes_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
+                CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+                                   &cctx->ks, (block128_f) vpaes_encrypt);
+                cctx->str = NULL;
+                cctx->key_set = 1;
+                break; /* done: skip the generic AES setup below */
+            }
+# endif
+            AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
+            CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, /* M, L as now */
+                               &cctx->ks, (block128_f) AES_encrypt);
+            cctx->str = NULL;
+            cctx->key_set = 1;
+        } while (0);
+    if (iv) {
+        memcpy(ctx->iv, iv, 15 - cctx->L); /* nonce length is 15 - L bytes */
+        cctx->iv_set = 1;
+    }
+    return 1;
+}
+/* CCM do_cipher: out==NULL sets length or feeds AAD, else en/decrypts data. */
+static int aes_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+    CCM128_CONTEXT *ccm = &cctx->ccm;
+    /* Need a key always, and a fresh IV for all but the data-less Final */
+    if (!cctx->key_set || (!cctx->iv_set && (in || !out)))
+        return -1;
+    if (!ctx->encrypt && !cctx->tag_set) /* decrypt needs the expected tag */
+        return -1;
+    if (!out) {
+        if (!in) { /* neither buffer: len is the total message length */
+            if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L, len))
+                return -1;
+            cctx->len_set = 1;
+            return len;
+        }
+        /* If have AAD need message length */
+        if (!cctx->len_set && len)
+            return -1;
+        CRYPTO_ccm128_aad(ccm, in, len);
+        return len;
+    }
+    /* EVP_*Final() doesn't return any data */
+    if (!in)
+        return 0;
+    /* If not set length yet do it */
+    if (!cctx->len_set) {
+        if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L, len))
+            return -1;
+        cctx->len_set = 1;
+    }
+    if (ctx->encrypt) {
+        if (cctx->str ? CRYPTO_ccm128_encrypt_ccm64(ccm, in, out, len,
+                                                    cctx->str) :
+            CRYPTO_ccm128_encrypt(ccm, in, out, len))
+            return -1;
+        cctx->tag_set = 1; /* tag can now be fetched via the GET_TAG ctrl */
+        return len;
+    } else {
+        int rv = -1;
+        if (cctx->str ? !CRYPTO_ccm128_decrypt_ccm64(ccm, in, out, len,
+                                                     cctx->str) :
+            !CRYPTO_ccm128_decrypt(ccm, in, out, len)) {
+            unsigned char tag[16]; /* NOTE(review): memcmp is not const-time */
+            if (CRYPTO_ccm128_tag(ccm, tag, cctx->M)) {
+                if (!memcmp(tag, ctx->buf, cctx->M))
+                    rv = len;
+            }
+        }
+        if (rv == -1) /* never hand back unauthenticated plaintext */
+            OPENSSL_cleanse(out, len);
+        cctx->iv_set = 0; /* one-shot: IV, tag and length must be set again */
+        cctx->tag_set = 0;
+        cctx->len_set = 0;
+        return rv;
+    }
+
+}
+
+# define aes_ccm_cleanup NULL
+
+BLOCK_CIPHER_custom(NID_aes, 128, 1, 12, ccm, CCM,
+ EVP_CIPH_FLAG_FIPS | CUSTOM_FLAGS)
+ BLOCK_CIPHER_custom(NID_aes, 192, 1, 12, ccm, CCM,
+ EVP_CIPH_FLAG_FIPS | CUSTOM_FLAGS)
+ BLOCK_CIPHER_custom(NID_aes, 256, 1, 12, ccm, CCM,
+ EVP_CIPH_FLAG_FIPS | CUSTOM_FLAGS)
+#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/e_des3.c Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,381 @@
+/* crypto/evp/e_des3.c */
+/* Copyright (C) 1995-1998 Eric Young ([email protected])
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young ([email protected]).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson ([email protected]).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young ([email protected])"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson ([email protected])"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#ifndef OPENSSL_NO_DES
+# include <openssl/evp.h>
+# include <openssl/objects.h>
+# include "evp_locl.h"
+# include <openssl/des.h>
+# include <openssl/rand.h>
+/* Cipher data for the DES-EDE ciphers: three key schedules + optional CBC. */
+typedef struct {
+    union {
+        double align; /* gives the union at least the alignment of double */
+        DES_key_schedule ks[3];
+    } ks;
+    union {
+        void (*cbc) (const void *, void *, size_t,
+                     const DES_key_schedule *, unsigned char *);
+    } stream; /* stream.cbc, when non-NULL, is called by des_ede_cbc_cipher */
+} DES_EDE_KEY;
+# define ks1 ks.ks[0]
+# define ks2 ks.ks[1]
+# define ks3 ks.ks[2]
+
+# if defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
+/* ---------^^^ this is not a typo, just a way to detect that
+ * assembler support was in general requested... */
+# include "sparc_arch.h"
+
+extern unsigned int OPENSSL_sparcv9cap_P[];
+
+# define SPARC_DES_CAPABLE (OPENSSL_sparcv9cap_P[1] & CFR_DES)
+
+void des_t4_key_expand(const void *key, DES_key_schedule *ks);
+void des_t4_ede3_cbc_encrypt(const void *inp, void *out, size_t len,
+ const DES_key_schedule ks[3], unsigned char iv[8]);
+void des_t4_ede3_cbc_decrypt(const void *inp, void *out, size_t len,
+ const DES_key_schedule ks[3], unsigned char iv[8]);
+# endif
+
+static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc);
+
+static int des_ede3_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc);
+
+static int des3_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr);
+
+# define data(ctx) ((DES_EDE_KEY *)(ctx)->cipher_data)
+
+/*
+ * Because of various casts and different args can't use
+ * IMPLEMENT_BLOCK_CIPHER
+ */
+/* ECB: one DES_ecb3_encrypt() per iteration of BLOCK_CIPHER_ecb_loop(). */
+static int des_ede_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                              const unsigned char *in, size_t inl)
+{
+    BLOCK_CIPHER_ecb_loop() /* macro from outside this file; supplies i */
+        DES_ecb3_encrypt((const_DES_cblock *)(in + i),
+                         (DES_cblock *)(out + i),
+                         &data(ctx)->ks1, &data(ctx)->ks2,
+                         &data(ctx)->ks3, ctx->encrypt);
+    return 1;
+}
+/* OFB64 for the EDE ciphers; input is fed in EVP_MAXCHUNK-sized pieces. */
+static int des_ede_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                              const unsigned char *in, size_t inl)
+{
+    while (inl >= EVP_MAXCHUNK) { /* length argument is passed as a long */
+        DES_ede3_ofb64_encrypt(in, out, (long)EVP_MAXCHUNK,
+                               &data(ctx)->ks1, &data(ctx)->ks2,
+                               &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                               &ctx->num);
+        inl -= EVP_MAXCHUNK;
+        in += EVP_MAXCHUNK;
+        out += EVP_MAXCHUNK;
+    }
+    if (inl) /* remainder shorter than one chunk */
+        DES_ede3_ofb64_encrypt(in, out, (long)inl,
+                               &data(ctx)->ks1, &data(ctx)->ks2,
+                               &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                               &ctx->num);
+
+    return 1;
+}
+/* CBC for the EDE ciphers: uses stream.cbc if set, else chunked software. */
+static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                              const unsigned char *in, size_t inl)
+{
+    DES_EDE_KEY *dat = data(ctx);
+
+# ifdef KSSL_DEBUG
+    {
+        int i;
+        fprintf(stderr, "des_ede_cbc_cipher(ctx=%p, buflen=%d)\n", ctx,
+                ctx->buf_len);
+        fprintf(stderr, "\t iv= ");
+        for (i = 0; i < 8; i++)
+            fprintf(stderr, "%02X", ctx->iv[i]);
+        fprintf(stderr, "\n");
+    }
+# endif /* KSSL_DEBUG */
+    if (dat->stream.cbc) { /* routine installed by the init_key functions */
+        (*dat->stream.cbc) (in, out, inl, dat->ks.ks, ctx->iv);
+        return 1;
+    }
+
+    while (inl >= EVP_MAXCHUNK) { /* length argument is passed as a long */
+        DES_ede3_cbc_encrypt(in, out, (long)EVP_MAXCHUNK,
+                             &dat->ks1, &dat->ks2, &dat->ks3,
+                             (DES_cblock *)ctx->iv, ctx->encrypt);
+        inl -= EVP_MAXCHUNK;
+        in += EVP_MAXCHUNK;
+        out += EVP_MAXCHUNK;
+    }
+    if (inl) /* remainder shorter than one chunk */
+        DES_ede3_cbc_encrypt(in, out, (long)inl,
+                             &dat->ks1, &dat->ks2, &dat->ks3,
+                             (DES_cblock *)ctx->iv, ctx->encrypt);
+    return 1;
+}
+/* CFB64 for the EDE ciphers; input is fed in EVP_MAXCHUNK-sized pieces. */
+static int des_ede_cfb64_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                                const unsigned char *in, size_t inl)
+{
+    while (inl >= EVP_MAXCHUNK) { /* length argument is passed as a long */
+        DES_ede3_cfb64_encrypt(in, out, (long)EVP_MAXCHUNK,
+                               &data(ctx)->ks1, &data(ctx)->ks2,
+                               &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                               &ctx->num, ctx->encrypt);
+        inl -= EVP_MAXCHUNK;
+        in += EVP_MAXCHUNK;
+        out += EVP_MAXCHUNK;
+    }
+    if (inl) /* remainder shorter than one chunk */
+        DES_ede3_cfb64_encrypt(in, out, (long)inl,
+                               &data(ctx)->ks1, &data(ctx)->ks2,
+                               &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                               &ctx->num, ctx->encrypt);
+    return 1;
+}
+
+/*
+ * Although we have a CFB-r implementation for 3-DES, it doesn't pack the
+ * right way, so wrap it here: one call per bit, with inl counted in bits.
+ */
+static int des_ede3_cfb1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                                const unsigned char *in, size_t inl)
+{
+    size_t n;
+    unsigned char c[1], d[1];
+
+    for (n = 0; n < inl; ++n) {
+        c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 0x80 : 0; /* bit n, MSB 1st */
+        DES_ede3_cfb_encrypt(c, d, 1, 1,
+                             &data(ctx)->ks1, &data(ctx)->ks2,
+                             &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                             ctx->encrypt);
+        out[n / 8] = (out[n / 8] & ~(0x80 >> (unsigned int)(n % 8)))
+            | ((d[0] & 0x80) >> (unsigned int)(n % 8)); /* store result bit */
+    }
+
+    return 1;
+}
+/* CFB8 for 3-key EDE; input is fed in EVP_MAXCHUNK-sized pieces. */
+static int des_ede3_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                                const unsigned char *in, size_t inl)
+{
+    while (inl >= EVP_MAXCHUNK) { /* length argument is passed as a long */
+        DES_ede3_cfb_encrypt(in, out, 8, (long)EVP_MAXCHUNK,
+                             &data(ctx)->ks1, &data(ctx)->ks2,
+                             &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                             ctx->encrypt);
+        inl -= EVP_MAXCHUNK;
+        in += EVP_MAXCHUNK;
+        out += EVP_MAXCHUNK;
+    }
+    if (inl) /* remainder shorter than one chunk */
+        DES_ede3_cfb_encrypt(in, out, 8, (long)inl,
+                             &data(ctx)->ks1, &data(ctx)->ks2,
+                             &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                             ctx->encrypt);
+    return 1;
+}
+
+BLOCK_CIPHER_defs(des_ede, DES_EDE_KEY, NID_des_ede, 8, 16, 8, 64,
+ EVP_CIPH_RAND_KEY | EVP_CIPH_FLAG_DEFAULT_ASN1,
+ des_ede_init_key, NULL, NULL, NULL, des3_ctrl)
+# define des_ede3_cfb64_cipher des_ede_cfb64_cipher
+# define des_ede3_ofb_cipher des_ede_ofb_cipher
+# define des_ede3_cbc_cipher des_ede_cbc_cipher
+# define des_ede3_ecb_cipher des_ede_ecb_cipher
+ BLOCK_CIPHER_defs(des_ede3, DES_EDE_KEY, NID_des_ede3, 8, 24, 8, 64,
+ EVP_CIPH_RAND_KEY | EVP_CIPH_FLAG_FIPS |
+ EVP_CIPH_FLAG_DEFAULT_ASN1, des_ede3_init_key, NULL, NULL, NULL,
+ des3_ctrl)
+
+ BLOCK_CIPHER_def_cfb(des_ede3, DES_EDE_KEY, NID_des_ede3, 24, 8, 1,
+ EVP_CIPH_RAND_KEY | EVP_CIPH_FLAG_FIPS |
+ EVP_CIPH_FLAG_DEFAULT_ASN1, des_ede3_init_key, NULL, NULL,
+ NULL, des3_ctrl)
+
+ BLOCK_CIPHER_def_cfb(des_ede3, DES_EDE_KEY, NID_des_ede3, 24, 8, 8,
+ EVP_CIPH_RAND_KEY | EVP_CIPH_FLAG_FIPS |
+ EVP_CIPH_FLAG_DEFAULT_ASN1, des_ede3_init_key, NULL, NULL,
+ NULL, des3_ctrl)
+/* Two-key EDE setup: expands K1 and K2 and reuses K1's schedule as K3. */
+static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                            const unsigned char *iv, int enc)
+{
+    DES_cblock *deskey = (DES_cblock *)key;
+    DES_EDE_KEY *dat = data(ctx);
+
+    dat->stream.cbc = NULL; /* default: no dedicated CBC routine */
+# if defined(SPARC_DES_CAPABLE)
+    if (SPARC_DES_CAPABLE) {
+        int mode = ctx->cipher->flags & EVP_CIPH_MODE;
+
+        if (mode == EVP_CIPH_CBC_MODE) { /* T4 routines are CBC-only here */
+            des_t4_key_expand(&deskey[0], &dat->ks1);
+            des_t4_key_expand(&deskey[1], &dat->ks2);
+            memcpy(&dat->ks3, &dat->ks1, sizeof(dat->ks1));
+            dat->stream.cbc = enc ? des_t4_ede3_cbc_encrypt :
+                des_t4_ede3_cbc_decrypt;
+            return 1; /* NOTE(review): skips the EVP_CHECK_DES_KEY checks */
+        }
+    }
+# endif
+# ifdef EVP_CHECK_DES_KEY
+    if (DES_set_key_checked(&deskey[0], &dat->ks1)
+        || DES_set_key_checked(&deskey[1], &dat->ks2))
+        return 0;
+# else
+    DES_set_key_unchecked(&deskey[0], &dat->ks1);
+    DES_set_key_unchecked(&deskey[1], &dat->ks2);
+# endif
+    memcpy(&dat->ks3, &dat->ks1, sizeof(dat->ks1)); /* K3 = K1 */
+    return 1;
+}
+/* Three-key EDE setup: expands K1, K2 and K3 from 24 bytes of key. */
+static int des_ede3_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                             const unsigned char *iv, int enc)
+{
+    DES_cblock *deskey = (DES_cblock *)key;
+    DES_EDE_KEY *dat = data(ctx);
+
+# ifdef KSSL_DEBUG
+    {
+        int i;
+        fprintf(stderr, "des_ede3_init_key(ctx=%p)\n", ctx);
+        fprintf(stderr, "\tKEY= "); /* debug builds print the raw key */
+        for (i = 0; i < 24; i++)
+            fprintf(stderr, "%02X", key[i]);
+        fprintf(stderr, "\n");
+        if (iv) {
+            fprintf(stderr, "\t IV= ");
+            for (i = 0; i < 8; i++)
+                fprintf(stderr, "%02X", iv[i]);
+            fprintf(stderr, "\n");
+        }
+    }
+# endif /* KSSL_DEBUG */
+
+    dat->stream.cbc = NULL; /* default: no dedicated CBC routine */
+# if defined(SPARC_DES_CAPABLE)
+    if (SPARC_DES_CAPABLE) {
+        int mode = ctx->cipher->flags & EVP_CIPH_MODE;
+
+        if (mode == EVP_CIPH_CBC_MODE) { /* T4 routines are CBC-only here */
+            des_t4_key_expand(&deskey[0], &dat->ks1);
+            des_t4_key_expand(&deskey[1], &dat->ks2);
+            des_t4_key_expand(&deskey[2], &dat->ks3);
+            dat->stream.cbc = enc ? des_t4_ede3_cbc_encrypt :
+                des_t4_ede3_cbc_decrypt;
+            return 1; /* NOTE(review): skips the EVP_CHECK_DES_KEY checks */
+        }
+    }
+# endif
+# ifdef EVP_CHECK_DES_KEY
+    if (DES_set_key_checked(&deskey[0], &dat->ks1)
+        || DES_set_key_checked(&deskey[1], &dat->ks2)
+        || DES_set_key_checked(&deskey[2], &dat->ks3))
+        return 0;
+# else
+    DES_set_key_unchecked(&deskey[0], &dat->ks1);
+    DES_set_key_unchecked(&deskey[1], &dat->ks2);
+    DES_set_key_unchecked(&deskey[2], &dat->ks3);
+# endif
+    return 1;
+}
+/* ctrl: EVP_CTRL_RAND_KEY fills ptr with a random odd-parity key; else -1. */
+static int des3_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+{
+
+    DES_cblock *deskey = ptr; /* ptr viewed as an array of 8-byte DES keys */
+
+    switch (type) {
+    case EVP_CTRL_RAND_KEY:
+        if (RAND_bytes(ptr, c->key_len) <= 0)
+            return 0;
+        DES_set_odd_parity(deskey);
+        if (c->key_len >= 16) /* a second key is present */
+            DES_set_odd_parity(deskey + 1);
+        if (c->key_len >= 24) /* a third key is present */
+            DES_set_odd_parity(deskey + 2);
+        return 1;
+
+    default:
+        return -1;
+    }
+}
+/* EVP_des_ede() is an alias for the ECB variant of the des_ede cipher. */
+const EVP_CIPHER *EVP_des_ede(void)
+{
+    return &des_ede_ecb;
+}
+/* EVP_des_ede3() is an alias for the ECB variant of the des_ede3 cipher. */
+const EVP_CIPHER *EVP_des_ede3(void)
+{
+    return &des_ede3_ecb;
+}
+
+#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/sha1-sparcv9.pl Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,428 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Hardware SPARC T4 support by David S. Miller <[email protected]>.
+# ====================================================================
+
+# Performance improvement is not really impressive on pre-T1 CPU: +8%
+# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
+# turned out to be 40% faster than 64-bit code generated by Sun C 5.8 and
+# >2x than 64-bit code generated by gcc 3.4. And there is a gimmick.
+# X[16] vector is packed to 8 64-bit registers and as result nothing
+# is spilled on stack. In addition input data is loaded in compact
+# instruction sequence, thus minimizing the window when the code is
+# subject to [inter-thread] cache-thrashing hazard. The goal is to
+# ensure scalability on UltraSPARC T1, or rather to avoid decay when
+# amount of active threads exceeds the number of physical cores.
+
+# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
+# faster than software. Multi-process benchmark saturates at 11x
+# single-process result on 8-core processor, or ~9GBps per 2.85GHz
+# socket.
+
+$output=shift;
+open STDOUT,">$output";
+
+@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
+$rot1m="%g2";
+$tmp64="%g3";
+$Xi="%g4";
+$A="%l0";
+$B="%l1";
+$C="%l2";
+$D="%l3";
+$E="%l4";
+@V=($A,$B,$C,$D,$E);
+$K_00_19="%l5";
+$K_20_39="%l6";
+$K_40_59="%l7";
+$K_60_79="%g5";
+@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
+
+$ctx="%i0";
+$inp="%i1";
+$len="%i2";
+$tmp0="%i3";
+$tmp1="%i4";
+$tmp2="%i5";
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $xi=($i&1)?@X[($i/2)%8]:$Xi;
+
+$code.=<<___;
+ sll $a,5,$tmp0 !! $i
+ add @K[$i/20],$e,$e
+ srl $a,27,$tmp1
+ add $tmp0,$e,$e
+ and $c,$b,$tmp0
+ add $tmp1,$e,$e
+ sll $b,30,$tmp2
+ andn $d,$b,$tmp1
+ srl $b,2,$b
+ or $tmp1,$tmp0,$tmp1
+ or $tmp2,$b,$b
+ add $xi,$e,$e
+___
+if ($i&1 && $i<15) {
+ $code.=
+ " srlx @X[(($i+1)/2)%8],32,$Xi\n";
+}
+$code.=<<___;
+ add $tmp1,$e,$e
+___
+}
+
+sub Xupdate {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i/2;
+
+if ($i&1) {
+$code.=<<___;
+ sll $a,5,$tmp0 !! $i
+ add @K[$i/20],$e,$e
+ srl $a,27,$tmp1
+___
+} else {
+$code.=<<___;
+ sllx @X[($j+6)%8],32,$Xi ! Xupdate($i)
+ xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
+ srlx @X[($j+7)%8],32,$tmp1
+ xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
+ sll $a,5,$tmp0 !! $i
+ or $tmp1,$Xi,$Xi
+ add @K[$i/20],$e,$e !!
+ xor $Xi,@X[$j%8],@X[$j%8]
+ srlx @X[$j%8],31,$Xi
+ add @X[$j%8],@X[$j%8],@X[$j%8]
+ and $Xi,$rot1m,$Xi
+ andn @X[$j%8],$rot1m,@X[$j%8]
+ srl $a,27,$tmp1 !!
+ or $Xi,@X[$j%8],@X[$j%8]
+___
+}
+}
+
+sub BODY_16_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+
+ &Xupdate(@_);
+ if ($i&1) {
+ $xi=@X[($i/2)%8];
+ } else {
+ $xi=$Xi;
+ $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
+ }
+$code.=<<___;
+ add $tmp0,$e,$e !!
+ and $c,$b,$tmp0
+ add $tmp1,$e,$e
+ sll $b,30,$tmp2
+ add $xi,$e,$e
+ andn $d,$b,$tmp1
+ srl $b,2,$b
+ or $tmp1,$tmp0,$tmp1
+ or $tmp2,$b,$b
+ add $tmp1,$e,$e
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $xi;
+ &Xupdate(@_);
+ if ($i&1) {
+ $xi=@X[($i/2)%8];
+ } else {
+ $xi=$Xi;
+ $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
+ }
+$code.=<<___;
+ add $tmp0,$e,$e !!
+ xor $c,$b,$tmp0
+ add $tmp1,$e,$e
+ sll $b,30,$tmp2
+ xor $d,$tmp0,$tmp1
+ srl $b,2,$b
+ add $tmp1,$e,$e
+ or $tmp2,$b,$b
+ add $xi,$e,$e
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $xi;
+ &Xupdate(@_);
+ if ($i&1) {
+ $xi=@X[($i/2)%8];
+ } else {
+ $xi=$Xi;
+ $code.="\tsrlx @X[($i/2)%8],32,$xi\n";
+ }
+$code.=<<___;
+ add $tmp0,$e,$e !!
+ and $c,$b,$tmp0
+ add $tmp1,$e,$e
+ sll $b,30,$tmp2
+ or $c,$b,$tmp1
+ srl $b,2,$b
+ and $d,$tmp1,$tmp1
+ add $xi,$e,$e
+ or $tmp1,$tmp0,$tmp1
+ or $tmp2,$b,$b
+ add $tmp1,$e,$e
+___
+}
+
+$code.=<<___;
+#include "sparc_arch.h"
+#include <openssl/fipssyms.h>
+
+#ifdef __arch64__
+.register %g2,#scratch
+.register %g3,#scratch
+#endif
+
+.section ".text",#alloc,#execinstr
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+
+.align 32
+.globl sha1_block_data_order
+sha1_block_data_order:
+ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+ ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
+
+ andcc %g1, CFR_SHA1, %g0
+ be .Lsoftware
+ nop
+
+ ld [%o0 + 0x00], %f0 ! load context
+ ld [%o0 + 0x04], %f1
+ ld [%o0 + 0x08], %f2
+ andcc %o1, 0x7, %g0
+ ld [%o0 + 0x0c], %f3
+ bne,pn %icc, .Lhwunaligned
+ ld [%o0 + 0x10], %f4
+
+.Lhw_loop:
+ ldd [%o1 + 0x00], %f8
+ ldd [%o1 + 0x08], %f10
+ ldd [%o1 + 0x10], %f12
+ ldd [%o1 + 0x18], %f14
+ ldd [%o1 + 0x20], %f16
+ ldd [%o1 + 0x28], %f18
+ ldd [%o1 + 0x30], %f20
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x38], %f22
+ add %o1, 0x40, %o1
+ prefetch [%o1 + 63], 20
+
+ .word 0x81b02820 ! SHA1
+
+ bne,pt SIZE_T_CC, .Lhw_loop
+ nop
+
+.Lhwfinish:
+ st %f0, [%o0 + 0x00] ! store context
+ st %f1, [%o0 + 0x04]
+ st %f2, [%o0 + 0x08]
+ st %f3, [%o0 + 0x0c]
+ retl
+ st %f4, [%o0 + 0x10]
+
+.align 8
+.Lhwunaligned:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f10
+.Lhwunaligned_loop:
+ ldd [%o1 + 0x08], %f12
+ ldd [%o1 + 0x10], %f14
+ ldd [%o1 + 0x18], %f16
+ ldd [%o1 + 0x20], %f18
+ ldd [%o1 + 0x28], %f20
+ ldd [%o1 + 0x30], %f22
+ ldd [%o1 + 0x38], %f24
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x40], %f26
+ add %o1, 0x40, %o1
+ prefetch [%o1 + 63], 20
+
+ faligndata %f10, %f12, %f8
+ faligndata %f12, %f14, %f10
+ faligndata %f14, %f16, %f12
+ faligndata %f16, %f18, %f14
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+
+ .word 0x81b02820 ! SHA1
+
+ bne,pt SIZE_T_CC, .Lhwunaligned_loop
+ for %f26, %f26, %f10 ! %f10=%f26
+
+ ba .Lhwfinish
+ nop
+
+.align 16
+.Lsoftware:
+ save %sp,-STACK_FRAME,%sp
+ sllx $len,6,$len
+ add $inp,$len,$len
+
+ or %g0,1,$rot1m
+ sllx $rot1m,32,$rot1m
+ or $rot1m,1,$rot1m
+
+ ld [$ctx+0],$A
+ ld [$ctx+4],$B
+ ld [$ctx+8],$C
+ ld [$ctx+12],$D
+ ld [$ctx+16],$E
+ andn $inp,7,$tmp0
+
+ sethi %hi(0x5a827999),$K_00_19
+ or $K_00_19,%lo(0x5a827999),$K_00_19
+ sethi %hi(0x6ed9eba1),$K_20_39
+ or $K_20_39,%lo(0x6ed9eba1),$K_20_39
+ sethi %hi(0x8f1bbcdc),$K_40_59
+ or $K_40_59,%lo(0x8f1bbcdc),$K_40_59
+ sethi %hi(0xca62c1d6),$K_60_79
+ or $K_60_79,%lo(0xca62c1d6),$K_60_79
+
+.Lloop:
+ ldx [$tmp0+0],@X[0]
+ ldx [$tmp0+16],@X[2]
+ ldx [$tmp0+32],@X[4]
+ ldx [$tmp0+48],@X[6]
+ and $inp,7,$tmp1
+ ldx [$tmp0+8],@X[1]
+ sll $tmp1,3,$tmp1
+ ldx [$tmp0+24],@X[3]
+ subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too
+ ldx [$tmp0+40],@X[5]
+ bz,pt %icc,.Laligned
+ ldx [$tmp0+56],@X[7]
+
+ sllx @X[0],$tmp1,@X[0]
+ ldx [$tmp0+64],$tmp64
+___
+for($i=0;$i<7;$i++)
+{ $code.=<<___;
+ srlx @X[$i+1],$tmp2,$Xi
+ sllx @X[$i+1],$tmp1,@X[$i+1]
+ or $Xi,@X[$i],@X[$i]
+___
+}
+$code.=<<___;
+ srlx $tmp64,$tmp2,$tmp64
+ or $tmp64,@X[7],@X[7]
+.Laligned:
+ srlx @X[0],32,$Xi
+___
+for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
+for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+
+ ld [$ctx+0],@X[0]
+ ld [$ctx+4],@X[1]
+ ld [$ctx+8],@X[2]
+ ld [$ctx+12],@X[3]
+ add $inp,64,$inp
+ ld [$ctx+16],@X[4]
+ cmp $inp,$len
+
+ add $A,@X[0],$A
+ st $A,[$ctx+0]
+ add $B,@X[1],$B
+ st $B,[$ctx+4]
+ add $C,@X[2],$C
+ st $C,[$ctx+8]
+ add $D,@X[3],$D
+ st $D,[$ctx+12]
+ add $E,@X[4],$E
+ st $E,[$ctx+16]
+
+ bne SIZE_T_CC,.Lloop
+ andn $inp,7,$tmp0
+
+ ret
+ restore
+.type sha1_block_data_order,#function
+.size sha1_block_data_order,(.-sha1_block_data_order)
+.asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# Idea is to reserve for option to produce "universal" binary and let
+# programmer detect if current CPU is VIS capable at run-time.
+sub unvis {    # encode a known VIS op as a raw .word, else return it as-is
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);    # parenthesized so that both variables are lexical
+my %visopf = ( "faligndata" => 0x048,
+               "for" => 0x07c );
+
+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+    if ($opf=$visopf{$mnemonic}) {
+        foreach ($rs1,$rs2,$rd) {
+            return $ref if (!/%f([0-9]{1,2})/);
+            $_=$1;    # $_ aliases the operand: keep only the register number
+            if ($1>=32) {
+                return $ref if ($1&1);
+                # re-encode for upper double register addressing
+                $_=($1|$1>>5)&31;
+            }
+        }
+
+        return sprintf ".word\t0x%08x !%s",
+                       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+                       $ref;
+    } else {
+        return $ref;    # not in %visopf: leave the instruction untouched
+    }
+}
+sub unalignaddr {    # encode alignaddr as a raw .word, else return it as-is
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my $ref="$mnemonic\t$rs1,$rs2,$rd";
+
+    foreach ($rs1,$rs2,$rd) {
+        if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }    # name -> number
+        else { return $ref; }    # not an integer register: leave untouched
+    }
+    return sprintf ".word\t0x%08x !%s",
+                   0x81b00300|$rd<<25|$rs1<<14|$rs2,
+                   $ref;
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &unvis($1,$2,$3,$4)
+ /ge;
+ s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unalignaddr($1,$2,$3,$4)
+ /ge;
+
+ print $_,"\n";
+}
+
+close STDOUT;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/sha512-sparcv9.pl Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,851 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
+# ====================================================================
+
+# SHA256 performance improvement over compiler generated code varies
+# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
+# build]. Just like in SHA1 module I aim to ensure scalability on
+# UltraSPARC T1 by packing X[16] to 8 64-bit registers.
+
+# SHA512 on pre-T1 UltraSPARC.
+#
+# Performance is >75% better than 64-bit code generated by Sun C and
+# over 2x than 32-bit code. X[16] resides on stack, but access to it
+# is scheduled for L2 latency and staged through 32 least significant
+# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
+# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
+# good [optimal coefficient is 50%].
+#
+# SHA512 on UltraSPARC T1.
+#
+# It's not any faster than 64-bit code generated by Sun C 5.8. This is
+# because 64-bit code generator has the advantage of using 64-bit
+# loads(*) to access X[16], which I consciously traded for 32-/64-bit
+# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
+# code by 60%, not to mention that it doesn't suffer from severe decay
+# when running 4 times physical cores threads and that it leaves gcc
+# [3.4] behind by over 4x factor! If compared to SHA256, single thread
+# performance is only 10% better, but overall throughput for maximum
+# amount of threads for given CPU exceeds corresponding one of SHA256
+# by 30% [again, optimal coefficient is 50%].
+#
+# (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
+# in-order, i.e. a load instruction has to complete before the next
+# instruction in the given thread is executed, even if the latter is
+# not dependent on load result! This means that on T1 two 32-bit
+# loads are always slower than one 64-bit load. Once again this
+# is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
+# 2x32-bit loads can be as fast as 1x64-bit ones.
+#
+# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
+# which is 9.3x/11.1x faster than software. Multi-process benchmark
+# saturates at 11.5x single-process result on 8-core processor, or
+# ~11/16GBps per 2.85GHz socket.
+
+$output=shift;	# first argument: output file name (also selects the flavour)
+open STDOUT,">$output" or die "can't open $output: $!";	# no silent loss
+
+# The output file name selects the flavour: "512" anywhere in it generates
+# SHA-512, anything else SHA-256.  Both branches set the same parameters:
+#   $label               suffix used in the K table and function names
+#   $SZ                  word size in bytes
+#   $LD $ST $SLL $SRL    load, store and logical shift mnemonics
+#   @Sigma0/1 @sigma0/1  shift amounts used by the four sigma functions
+#   $lastK               low 12 bits of the last K entry (ends round loop)
+#   $rounds              total number of rounds
+#   $align               granularity the input pointer is rounded down to
+#   $locals              stack bytes reserved for X[16]
+#   @V                   registers holding the working variables a..h
+#   @X                   (SHA-256 only) eight registers packing X[16]
+if ($output =~ /512/) {
+    ($label,$SZ)=("512",8);
+
+    ($LD,$ST)=("ldx","stx");
+    ($SLL,$SRL)=("sllx","srlx");
+
+    @Sigma0=(28,34,39);
+    @Sigma1=(14,18,41);
+    @sigma0=( 7, 1, 8);	# right shift first
+    @sigma1=( 6,19,61);	# right shift first
+
+    $lastK=0x817;
+    ($rounds,$align)=(80,4);
+
+    $locals=16*$SZ;		# X[16]
+
+    # working variables a..h
+    ($A,$B,$C,$D)=("%o0","%o1","%o2","%o3");
+    ($E,$F,$G,$H)=("%o4","%o5","%g1","%o7");
+    @V=($A,$B,$C,$D,$E,$F,$G,$H);
+} else {
+    ($label,$SZ)=("256",4);
+
+    ($LD,$ST)=("ld","st");
+    ($SLL,$SRL)=("sll","srl");
+
+    @Sigma0=( 2,13,22);
+    @Sigma1=( 6,11,25);
+    @sigma0=( 3, 7,18);	# right shift first
+    @sigma1=(10,17,19);	# right shift first
+
+    $lastK=0x8f2;
+    ($rounds,$align)=(64,8);
+
+    $locals=0;		# X[16] is register resident
+    @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
+
+    # working variables a..h
+    ($A,$B,$C,$D)=("%l0","%l1","%l2","%l3");
+    ($E,$F,$G,$H)=("%l4","%l5","%l6","%l7");
+    @V=($A,$B,$C,$D,$E,$F,$G,$H);
+}
+
+# scratch registers used by both flavours
+($T1,$tmp0,$tmp1,$tmp2)=("%g2","%g3","%g4","%g5");
+
+# context, input pointer and length arguments, read from %i0-%i2
+($ctx,$inp,$len)=("%i0","%i1","%i2");
+
+# K table pointer
+$Ktbl="%i3";
+
+# shift counts derived from the misalignment of the input pointer
+($tmp31,$tmp32)=("%i4","%i5");
+########### SHA256
+$Xload = sub {	# SHA256: emit code leaving T1=h+X[i] for round $i (0..15)
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+# round 0 additionally loads the whole input block into @X[0..7]
+ if ($i==0) {
+$code.=<<___;	# eight 64-bit loads; branches to .Laligned when $tmp31 is zero
+ ldx [$inp+0],@X[0]
+ ldx [$inp+16],@X[2]
+ ldx [$inp+32],@X[4]
+ ldx [$inp+48],@X[6]
+ ldx [$inp+8],@X[1]
+ ldx [$inp+24],@X[3]
+ subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
+ ldx [$inp+40],@X[5]
+ bz,pt %icc,.Laligned
+ ldx [$inp+56],@X[7]
+
+ sllx @X[0],$tmp31,@X[0]
+ ldx [$inp+64],$T1
+___
+for($j=0;$j<7;$j++)	# merge each register with bits shifted in from the next
+{ $code.=<<___;
+ srlx @X[$j+1],$tmp32,$tmp1
+ sllx @X[$j+1],$tmp31,@X[$j+1]
+ or $tmp1,@X[$j],@X[$j]
+___
+}
+$code.=<<___;	# @X[7] takes its remaining bits from the ninth load in $T1
+ srlx $T1,$tmp32,$T1
+ or $T1,@X[7],@X[7]
+.Laligned:
+___
+ }
+# even rounds take the upper 32 bits of the register, odd rounds add it as is
+ if ($i&1) {
+ $code.="\tadd @X[$i/2],$h,$T1\n";
+ } else {
+ $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
+ }
+} if ($SZ==4);	# the assignment itself happens only for the SHA256 flavour
+
+########### SHA512
+$Xload = sub {	# SHA512: emit code leaving T1=h+X[i] for round $i (0..15)
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
+# @pair: %l registers with both 32-bit halves of X[i], then first half of X[i+1]
+$code.=<<___ if ($i==0);	# round 0 preloads the first eight 32-bit words
+ ld [$inp+0],%l0
+ ld [$inp+4],%l1
+ ld [$inp+8],%l2
+ ld [$inp+12],%l3
+ ld [$inp+16],%l4
+ ld [$inp+20],%l5
+ ld [$inp+24],%l6
+ cmp $tmp31,0
+ ld [$inp+28],%l7
+___
+$code.=<<___ if ($i<15);	# merge halves into $tmp2, store X[i] on the stack
+ sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
+ add $tmp31,32,$tmp0
+ sllx @pair[0],$tmp0,$tmp1
+ `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
+ srlx @pair[2],$tmp32,@pair[1]
+ or $tmp1,$tmp2,$tmp2
+ or @pair[1],$tmp2,$tmp2
+ `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
+ add $h,$tmp2,$T1
+ $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
+___
+$code.=<<___ if ($i==12);	# annulled branch; presumably uses cc of the cmp above
+ bnz,a,pn %icc,.+8
+ ld [$inp+128],%l0
+___
+$code.=<<___ if ($i==15);	# same merge, plus reloads of %l0-%l7 from stacked X[]
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
+ sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
+ add $tmp31,32,$tmp0
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
+ sllx @pair[0],$tmp0,$tmp1
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
+ srlx @pair[2],$tmp32,@pair[1]
+ or $tmp1,$tmp2,$tmp2
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
+ or @pair[1],$tmp2,$tmp2
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
+ add $h,$tmp2,$T1
+ $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
+___
+} if ($SZ==8);	# the assignment itself happens only for the SHA512 flavour
+
+########### common
+sub BODY_00_15 {	# emit one round; args: round index, then registers a..h
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+# first T1=h+X[i]: via $Xload for rounds 0-15, else X[i] is already in $T1
+ if ($i<16) {
+ &$Xload(@_);
+ } else {
+ $code.="\tadd $h,$T1,$T1\n";
+ }
+# then T1+=Ch+Sigma1+K[i], d+=T1 and h=Sigma0+Maj+T1, as marked in the text
+$code.=<<___;
+ $SRL $e,@Sigma1[0],$h !! $i
+ xor $f,$g,$tmp2
+ $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
+ and $e,$tmp2,$tmp2
+ $SRL $e,@Sigma1[1],$tmp0
+ xor $tmp1,$h,$h
+ $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
+ xor $tmp0,$h,$h
+ $SRL $e,@Sigma1[2],$tmp0
+ xor $tmp1,$h,$h
+ $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
+ xor $tmp0,$h,$h
+ xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
+ xor $tmp1,$h,$tmp0 ! Sigma1(e)
+
+ $SRL $a,@Sigma0[0],$h
+ add $tmp2,$T1,$T1
+ $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
+ $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
+ add $tmp0,$T1,$T1
+ $SRL $a,@Sigma0[1],$tmp0
+ xor $tmp1,$h,$h
+ $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
+ xor $tmp0,$h,$h
+ $SRL $a,@Sigma0[2],$tmp0
+ xor $tmp1,$h,$h
+ $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
+ xor $tmp0,$h,$h
+ xor $tmp1,$h,$h ! Sigma0(a)
+
+ or $a,$b,$tmp0
+ and $a,$b,$tmp1
+ and $c,$tmp0,$tmp0
+ or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
+ add $tmp2,$T1,$T1 ! +=K[$i]
+ add $tmp1,$h,$h
+
+ add $T1,$d,$d
+ add $T1,$h,$h
+___
+}
+
+########### SHA256
+$BODY_16_XX = sub {	# SHA256: emit the X[] update for round $i, then the round
+my $i=@_[0];	# round index; the whole argument list goes on to BODY_00_15
+my $xi;	# register holding the X[] word currently being processed
+
+ if ($i&1) {	# odd round: X[i+1] is an upper half, extracted into $tmp32
+ $xi=$tmp32;
+ $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
+ } else {
+ $xi=@X[(($i+1)/2)%8];
+ }
+$code.=<<___;	# T1=sigma0(X[i+1]); its last xor sits in the second chunk
+ srl $xi,@sigma0[0],$T1 !! Xupdate($i)
+ sll $xi,`32-@sigma0[2]`,$tmp1
+ srl $xi,@sigma0[1],$tmp0
+ xor $tmp1,$T1,$T1
+ sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
+ xor $tmp0,$T1,$T1
+ srl $xi,@sigma0[2],$tmp0
+ xor $tmp1,$T1,$T1
+___
+ if ($i&1) {	# X[i+14]: used in place on odd rounds, upper half otherwise
+ $xi=@X[(($i+14)/2)%8];
+ } else {
+ $xi=$tmp32;
+ $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
+ }
+$code.=<<___;	# $tmp2 accumulates sigma1(X[i+14])
+ srl $xi,@sigma1[0],$tmp2
+ xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
+ sll $xi,`32-@sigma1[2]`,$tmp1
+ srl $xi,@sigma1[1],$tmp0
+ xor $tmp1,$tmp2,$tmp2
+ sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
+ xor $tmp0,$tmp2,$tmp2
+ srl $xi,@sigma1[2],$tmp0
+ xor $tmp1,$tmp2,$tmp2
+___
+ if ($i&1) {	# odd round: the sum replaces the low half of the register
+ $xi=@X[($i/2)%8];
+$code.=<<___;
+ srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
+ xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
+ srl @X[($i/2)%8],0,$tmp0
+ add $tmp2,$tmp1,$tmp1
+ add $xi,$T1,$T1 ! +=X[i]
+ xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
+ add $tmp1,$T1,$T1
+
+ srl $T1,0,$T1
+ or $T1,@X[($i/2)%8],@X[($i/2)%8]
+___
+ } else {	# even round: the sum replaces the upper half of the register
+ $xi=@X[(($i+9)/2)%8];
+$code.=<<___;
+ srlx @X[($i/2)%8],32,$tmp1 ! X[i]
+ xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
+ add $xi,$T1,$T1 ! +=X[i+9]
+ add $tmp2,$tmp1,$tmp1
+ srl @X[($i/2)%8],0,@X[($i/2)%8]
+ add $tmp1,$T1,$T1
+
+ sllx $T1,32,$tmp0
+ or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
+___
+ }
+ &BODY_00_15(@_);	# finally emit the round itself
+} if ($SZ==4);	# the assignment itself happens only for the SHA256 flavour
+
+########### SHA512
+$BODY_16_XX = sub {	# SHA512: emit the X[] update for round $i, then the round
+my $i=@_[0];	# round index; the whole argument list goes on to BODY_00_15
+my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));	# NOTE(review): unused below
+# T1=sigma0(X[i+1])+sigma1(X[i+14])+X[i+9]+X[i]; stored to slot i%16 on the stack
+$code.=<<___;
+ sllx %l2,32,$tmp0 !! Xupdate($i)
+ or %l3,$tmp0,$tmp0
+
+ srlx $tmp0,@sigma0[0],$T1
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
+ sllx $tmp0,`64-@sigma0[2]`,$tmp1
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
+ srlx $tmp0,@sigma0[1],$tmp0
+ xor $tmp1,$T1,$T1
+ sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
+ xor $tmp0,$T1,$T1
+ srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
+ xor $tmp1,$T1,$T1
+ sllx %l6,32,$tmp2
+ xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
+ or %l7,$tmp2,$tmp2
+
+ srlx $tmp2,@sigma1[0],$tmp1
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
+ sllx $tmp2,`64-@sigma1[2]`,$tmp0
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
+ srlx $tmp2,@sigma1[1],$tmp2
+ xor $tmp0,$tmp1,$tmp1
+ sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
+ xor $tmp2,$tmp1,$tmp1
+ srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
+ xor $tmp0,$tmp1,$tmp1
+ sllx %l4,32,$tmp0
+ xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
+ or %l5,$tmp0,$tmp0
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
+
+ sllx %l0,32,$tmp2
+ add $tmp1,$T1,$T1
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
+ or %l1,$tmp2,$tmp2
+ add $tmp0,$T1,$T1 ! +=X[$i+9]
+ ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
+ add $tmp2,$T1,$T1 ! +=X[$i]
+ $ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
+___
+ &BODY_00_15(@_);	# finally emit the round itself
+} if ($SZ==8);	# the assignment itself happens only for the SHA512 flavour
+
+$code.=<<___;
+#include "sparc_arch.h"
+#include <openssl/fipssyms.h>
+
+#ifdef __arch64__
+.register %g2,#scratch
+.register %g3,#scratch
+#endif
+
+.section ".text",#alloc,#execinstr
+
+.align 64
+K${label}:
+.type K${label},#object
+___
+if ($SZ==4) {
+$code.=<<___;
+ .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+___
+} else {
+$code.=<<___;
+ .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
+ .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
+ .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
+ .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
+ .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
+ .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
+ .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
+ .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
+ .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
+ .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
+ .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
+ .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
+ .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
+ .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
+ .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
+ .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
+ .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
+ .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
+ .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
+ .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
+ .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
+ .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
+ .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
+ .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
+ .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
+ .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
+ .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
+ .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
+ .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
+ .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
+ .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
+ .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
+ .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
+ .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
+ .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
+ .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
+ .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
+ .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
+ .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
+ .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+___
+}
+$code.=<<___;
+.size K${label},.-K${label}
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+
+.globl sha${label}_block_data_order
+.align 32
+sha${label}_block_data_order:
+ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+ ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
+
+ andcc %g1, CFR_SHA${label}, %g0
+ be .Lsoftware
+ nop
+___
+$code.=<<___ if ($SZ==8); # SHA512
+ ldd [%o0 + 0x00], %f0 ! load context
+ ldd [%o0 + 0x08], %f2
+ ldd [%o0 + 0x10], %f4
+ ldd [%o0 + 0x18], %f6
+ ldd [%o0 + 0x20], %f8
+ ldd [%o0 + 0x28], %f10
+ andcc %o1, 0x7, %g0
+ ldd [%o0 + 0x30], %f12
+ bne,pn %icc, .Lhwunaligned
+ ldd [%o0 + 0x38], %f14
+
+.Lhwaligned_loop:
+ ldd [%o1 + 0x00], %f16
+ ldd [%o1 + 0x08], %f18
+ ldd [%o1 + 0x10], %f20
+ ldd [%o1 + 0x18], %f22
+ ldd [%o1 + 0x20], %f24
+ ldd [%o1 + 0x28], %f26
+ ldd [%o1 + 0x30], %f28
+ ldd [%o1 + 0x38], %f30
+ ldd [%o1 + 0x40], %f32
+ ldd [%o1 + 0x48], %f34
+ ldd [%o1 + 0x50], %f36
+ ldd [%o1 + 0x58], %f38
+ ldd [%o1 + 0x60], %f40
+ ldd [%o1 + 0x68], %f42
+ ldd [%o1 + 0x70], %f44
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x78], %f46
+ add %o1, 0x80, %o1
+ prefetch [%o1 + 63], 20
+ prefetch [%o1 + 64+63], 20
+
+ .word 0x81b02860 ! SHA512
+
+ bne,pt SIZE_T_CC, .Lhwaligned_loop
+ nop
+
+.Lhwfinish:
+ std %f0, [%o0 + 0x00] ! store context
+ std %f2, [%o0 + 0x08]
+ std %f4, [%o0 + 0x10]
+ std %f6, [%o0 + 0x18]
+ std %f8, [%o0 + 0x20]
+ std %f10, [%o0 + 0x28]
+ std %f12, [%o0 + 0x30]
+ retl
+ std %f14, [%o0 + 0x38]
+
+.align 16
+.Lhwunaligned:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f18
+.Lhwunaligned_loop:
+ ldd [%o1 + 0x08], %f20
+ ldd [%o1 + 0x10], %f22
+ ldd [%o1 + 0x18], %f24
+ ldd [%o1 + 0x20], %f26
+ ldd [%o1 + 0x28], %f28
+ ldd [%o1 + 0x30], %f30
+ ldd [%o1 + 0x38], %f32
+ ldd [%o1 + 0x40], %f34
+ ldd [%o1 + 0x48], %f36
+ ldd [%o1 + 0x50], %f38
+ ldd [%o1 + 0x58], %f40
+ ldd [%o1 + 0x60], %f42
+ ldd [%o1 + 0x68], %f44
+ ldd [%o1 + 0x70], %f46
+ ldd [%o1 + 0x78], %f48
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x80], %f50
+ add %o1, 0x80, %o1
+ prefetch [%o1 + 63], 20
+ prefetch [%o1 + 64+63], 20
+
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+ faligndata %f26, %f28, %f24
+ faligndata %f28, %f30, %f26
+ faligndata %f30, %f32, %f28
+ faligndata %f32, %f34, %f30
+ faligndata %f34, %f36, %f32
+ faligndata %f36, %f38, %f34
+ faligndata %f38, %f40, %f36
+ faligndata %f40, %f42, %f38
+ faligndata %f42, %f44, %f40
+ faligndata %f44, %f46, %f42
+ faligndata %f46, %f48, %f44
+ faligndata %f48, %f50, %f46
+
+ .word 0x81b02860 ! SHA512
+
+ bne,pt SIZE_T_CC, .Lhwunaligned_loop
+ for %f50, %f50, %f18 ! %f18=%f50
+
+ ba .Lhwfinish
+ nop
+___
+$code.=<<___ if ($SZ==4); # SHA256
+ ld [%o0 + 0x00], %f0
+ ld [%o0 + 0x04], %f1
+ ld [%o0 + 0x08], %f2
+ ld [%o0 + 0x0c], %f3
+ ld [%o0 + 0x10], %f4
+ ld [%o0 + 0x14], %f5
+ andcc %o1, 0x7, %g0
+ ld [%o0 + 0x18], %f6
+ bne,pn %icc, .Lhwunaligned
+ ld [%o0 + 0x1c], %f7
+
+.Lhwloop:
+ ldd [%o1 + 0x00], %f8
+ ldd [%o1 + 0x08], %f10
+ ldd [%o1 + 0x10], %f12
+ ldd [%o1 + 0x18], %f14
+ ldd [%o1 + 0x20], %f16
+ ldd [%o1 + 0x28], %f18
+ ldd [%o1 + 0x30], %f20
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x38], %f22
+ add %o1, 0x40, %o1
+ prefetch [%o1 + 63], 20
+
+ .word 0x81b02840 ! SHA256
+
+ bne,pt SIZE_T_CC, .Lhwloop
+ nop
+
+.Lhwfinish:
+ st %f0, [%o0 + 0x00] ! store context
+ st %f1, [%o0 + 0x04]
+ st %f2, [%o0 + 0x08]
+ st %f3, [%o0 + 0x0c]
+ st %f4, [%o0 + 0x10]
+ st %f5, [%o0 + 0x14]
+ st %f6, [%o0 + 0x18]
+ retl
+ st %f7, [%o0 + 0x1c]
+
+.align 8
+.Lhwunaligned:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f10
+.Lhwunaligned_loop:
+ ldd [%o1 + 0x08], %f12
+ ldd [%o1 + 0x10], %f14
+ ldd [%o1 + 0x18], %f16
+ ldd [%o1 + 0x20], %f18
+ ldd [%o1 + 0x28], %f20
+ ldd [%o1 + 0x30], %f22
+ ldd [%o1 + 0x38], %f24
+ subcc %o2, 1, %o2 ! done yet?
+ ldd [%o1 + 0x40], %f26
+ add %o1, 0x40, %o1
+ prefetch [%o1 + 63], 20
+
+ faligndata %f10, %f12, %f8
+ faligndata %f12, %f14, %f10
+ faligndata %f14, %f16, %f12
+ faligndata %f16, %f18, %f14
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+
+ .word 0x81b02840 ! SHA256
+
+ bne,pt SIZE_T_CC, .Lhwunaligned_loop
+ for %f26, %f26, %f10 ! %f10=%f26
+
+ ba .Lhwfinish
+ nop
+___
+$code.=<<___;
+.align 16
+.Lsoftware:
+ save %sp,-STACK_FRAME-$locals,%sp
+ and $inp,`$align-1`,$tmp31
+ sllx $len,`log(16*$SZ)/log(2)`,$len
+ andn $inp,`$align-1`,$inp
+ sll $tmp31,3,$tmp31
+ add $inp,$len,$len
+___
+$code.=<<___ if ($SZ==8); # SHA512
+ mov 32,$tmp32
+ sub $tmp32,$tmp31,$tmp32
+___
+$code.=<<___;
+.Lpic: call .+8
+ add %o7,K${label}-.Lpic,$Ktbl
+
+ $LD [$ctx+`0*$SZ`],$A
+ $LD [$ctx+`1*$SZ`],$B
+ $LD [$ctx+`2*$SZ`],$C
+ $LD [$ctx+`3*$SZ`],$D
+ $LD [$ctx+`4*$SZ`],$E
+ $LD [$ctx+`5*$SZ`],$F
+ $LD [$ctx+`6*$SZ`],$G
+ $LD [$ctx+`7*$SZ`],$H
+
+.Lloop:
+___
+$i=0; while ($i<16) { BODY_00_15($i,@V); @V=($V[7],@V[0..6]); $i++; }	# rounds 0-15
+$code.=".L16_xx:\n";
+while ($i<32) { $BODY_16_XX->($i,@V); @V=($V[7],@V[0..6]); $i++; }	# looped 16-round body
+$code.=<<___;
+ and $tmp2,0xfff,$tmp2
+ cmp $tmp2,$lastK
+ bne .L16_xx
+ add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
+
+___
+$code.=<<___ if ($SZ==4); # SHA256
+ $LD [$ctx+`0*$SZ`],@X[0]
+ $LD [$ctx+`1*$SZ`],@X[1]
+ $LD [$ctx+`2*$SZ`],@X[2]
+ $LD [$ctx+`3*$SZ`],@X[3]
+ $LD [$ctx+`4*$SZ`],@X[4]
+ $LD [$ctx+`5*$SZ`],@X[5]
+ $LD [$ctx+`6*$SZ`],@X[6]
+ $LD [$ctx+`7*$SZ`],@X[7]
+
+ add $A,@X[0],$A
+ $ST $A,[$ctx+`0*$SZ`]
+ add $B,@X[1],$B
+ $ST $B,[$ctx+`1*$SZ`]
+ add $C,@X[2],$C
+ $ST $C,[$ctx+`2*$SZ`]
+ add $D,@X[3],$D
+ $ST $D,[$ctx+`3*$SZ`]
+ add $E,@X[4],$E
+ $ST $E,[$ctx+`4*$SZ`]
+ add $F,@X[5],$F
+ $ST $F,[$ctx+`5*$SZ`]
+ add $G,@X[6],$G
+ $ST $G,[$ctx+`6*$SZ`]
+ add $H,@X[7],$H
+ $ST $H,[$ctx+`7*$SZ`]
+___
+$code.=<<___ if ($SZ==8); # SHA512
+ ld [$ctx+`0*$SZ+0`],%l0
+ ld [$ctx+`0*$SZ+4`],%l1
+ ld [$ctx+`1*$SZ+0`],%l2
+ ld [$ctx+`1*$SZ+4`],%l3
+ ld [$ctx+`2*$SZ+0`],%l4
+ ld [$ctx+`2*$SZ+4`],%l5
+ ld [$ctx+`3*$SZ+0`],%l6
+
+ sllx %l0,32,$tmp0
+ ld [$ctx+`3*$SZ+4`],%l7
+ sllx %l2,32,$tmp1
+ or %l1,$tmp0,$tmp0
+ or %l3,$tmp1,$tmp1
+ add $tmp0,$A,$A
+ add $tmp1,$B,$B
+ $ST $A,[$ctx+`0*$SZ`]
+ sllx %l4,32,$tmp2
+ $ST $B,[$ctx+`1*$SZ`]
+ sllx %l6,32,$T1
+ or %l5,$tmp2,$tmp2
+ or %l7,$T1,$T1
+ add $tmp2,$C,$C
+ $ST $C,[$ctx+`2*$SZ`]
+ add $T1,$D,$D
+ $ST $D,[$ctx+`3*$SZ`]
+
+ ld [$ctx+`4*$SZ+0`],%l0
+ ld [$ctx+`4*$SZ+4`],%l1
+ ld [$ctx+`5*$SZ+0`],%l2
+ ld [$ctx+`5*$SZ+4`],%l3
+ ld [$ctx+`6*$SZ+0`],%l4
+ ld [$ctx+`6*$SZ+4`],%l5
+ ld [$ctx+`7*$SZ+0`],%l6
+
+ sllx %l0,32,$tmp0
+ ld [$ctx+`7*$SZ+4`],%l7
+ sllx %l2,32,$tmp1
+ or %l1,$tmp0,$tmp0
+ or %l3,$tmp1,$tmp1
+ add $tmp0,$E,$E
+ add $tmp1,$F,$F
+ $ST $E,[$ctx+`4*$SZ`]
+ sllx %l4,32,$tmp2
+ $ST $F,[$ctx+`5*$SZ`]
+ sllx %l6,32,$T1
+ or %l5,$tmp2,$tmp2
+ or %l7,$T1,$T1
+ add $tmp2,$G,$G
+ $ST $G,[$ctx+`6*$SZ`]
+ add $T1,$H,$H
+ $ST $H,[$ctx+`7*$SZ`]
+___
+$code.=<<___;
+ add $inp,`16*$SZ`,$inp ! advance inp
+ cmp $inp,$len
+ bne SIZE_T_CC,.Lloop
+ sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
+
+ ret
+ restore
+.type sha${label}_block_data_order,#function
+.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
+.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+# The purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# The idea is to keep open the option of producing a "universal" binary and
+# to let the programmer detect at run-time whether the CPU is VIS capable.
+sub unvis {	# ($mnemonic,$rs1,$rs2,$rd) -> assembler text for one instruction
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);	# parenthesized: "my $ref,$opf" left $opf a package global
+my %visopf = ( "faligndata" => 0x048,
+               "for" => 0x07c );
+# $ref is the plain mnemonic form; it is returned whenever encoding is skipped
+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
+# only the mnemonics listed in %visopf are turned into a raw .word
+    if ($opf=$visopf{$mnemonic}) {
+        foreach ($rs1,$rs2,$rd) {	# aliases the operands: "%fN" becomes N
+            return $ref if (!/%f([0-9]{1,2})/);
+            $_=$1;
+            if ($1>=32) {
+                return $ref if ($1&1);	# odd register above 31: not encoded
+                # re-encode for upper double register addressing
+                $_=($1|$1>>5)&31;
+            }
+        }
+
+        return sprintf ".word\t0x%08x !%s",
+                       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+                       $ref;
+    } else {
+        return $ref;
+    }
+}
+sub unalignaddr {	# hand-encode "alignaddr rs1,rs2,rd" as a raw .word
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my $text="$mnemonic\t$rs1,$rs2,$rd";	# returned as is when not encodable
+my %base=( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );	# first number per bank
+my @num;				# register numbers in rs1,rs2,rd order
+
+    for my $reg ($rs1,$rs2,$rd) {
+        return $text unless ($reg =~ /%([goli])([0-7])/);
+        push(@num,$base{$1}+$2);
+    }
+    my $word=0x81b00300|$num[2]<<25|$num[0]<<14|$num[1];
+    return sprintf(".word\t0x%08x !%s",$word,$text);
+}
+
+foreach (split("\n",$code)) {	# post-process the generated text line by line
+    s/\`([^\`]*)\`/eval $1/ge;	# evaluate `...` arithmetic at generation time
+    # instructions starting with "f" and taking three %f operands go to unvis
+    s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+        &unvis($1,$2,$3,$4)
+     /ge;
+    s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+        &unalignaddr($1,$2,$3,$4)
+     /ge;
+
+    print $_,"\n";
+}
+# a failing close means output was lost: stop instead of leaving a short file
+close STDOUT or die "error closing STDOUT: $!";
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/sparc_arch.h Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,101 @@
+#ifndef __SPARC_ARCH_H__
+# define __SPARC_ARCH_H__
+/* SPARCV9_* flags: presumably bits of OPENSSL_sparcv9cap_P[0] -- TODO confirm */
+# define SPARCV9_TICK_PRIVILEGED (1<<0)
+# define SPARCV9_PREFER_FPU (1<<1)
+# define SPARCV9_VIS1 (1<<2)
+# define SPARCV9_VIS2 (1<<3)/* reserved */
+# define SPARCV9_FMADD (1<<4)/* reserved for SPARC64 V */
+# define SPARCV9_BLK (1<<5)/* VIS1 block copy */
+# define SPARCV9_VIS3 (1<<6)
+# define SPARCV9_RANDOM (1<<7)
+# define SPARCV9_64BIT_STACK (1<<8)
+
+/*
+ * OPENSSL_sparcv9cap_P[1] is copy of Compatibility Feature Register,
+ * %asr26, SPARC-T4 and later. There is no SPARCV9_CFR bit in
+ * OPENSSL_sparcv9cap_P[0], as %cfr copy is sufficient...
+ */
+# define CFR_AES 0x00000001/* Supports AES opcodes */
+# define CFR_DES 0x00000002/* Supports DES opcodes */
+# define CFR_KASUMI 0x00000004/* Supports KASUMI opcodes */
+# define CFR_CAMELLIA 0x00000008/* Supports CAMELLIA opcodes */
+# define CFR_MD5 0x00000010/* Supports MD5 opcodes */
+# define CFR_SHA1 0x00000020/* Supports SHA1 opcodes */
+# define CFR_SHA256 0x00000040/* Supports SHA256 opcodes */
+# define CFR_SHA512 0x00000080/* Supports SHA512 opcodes */
+# define CFR_MPMUL 0x00000100/* Supports MPMUL opcodes */
+# define CFR_MONTMUL 0x00000200/* Supports MONTMUL opcodes */
+# define CFR_MONTSQR 0x00000400/* Supports MONTSQR opcodes */
+# define CFR_CRC32C 0x00000800/* Supports CRC32C opcodes */
+/* OPENSSL_PIC is treated as a request for position-independent code. */
+# if defined(OPENSSL_PIC) && !defined(__PIC__)
+# define __PIC__
+# endif
+/* Sun C with __sparcv9 defined is handled as if __arch64__ were defined. */
+# if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__)
+# define __arch64__
+# endif
+/* Emits .Lpic_thunk: returns to %o7+8 and adds %o7 to reg on the way. */
+# define SPARC_PIC_THUNK(reg) \
+ .align 32; \
+.Lpic_thunk: \
+ jmp %o7 + 8; \
+ add %o7, reg, reg;
+/* Puts the _GLOBAL_OFFSET_TABLE_ offset into reg and calls .Lpic_thunk. */
+# define SPARC_PIC_THUNK_CALL(reg) \
+ sethi %hi(_GLOBAL_OFFSET_TABLE_-4), reg; \
+ call .Lpic_thunk; \
+ or reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg;
+/* "if 1": the thunk call is what is used; the inline call .+8 form is off. */
+# if 1
+# define SPARC_SETUP_GOT_REG(reg) SPARC_PIC_THUNK_CALL(reg)
+# else
+# define SPARC_SETUP_GOT_REG(reg) \
+ sethi %hi(_GLOBAL_OFFSET_TABLE_-4), reg; \
+ call .+8; \
+ or reg,%lo(_GLOBAL_OFFSET_TABLE_+4), reg; \
+ add %o7, reg, reg
+# endif
+/* ABI-dependent definitions: __arch64__ values first, the others below. */
+# if defined(__arch64__)
+
+# define SPARC_LOAD_ADDRESS(SYM, reg) \
+ setx SYM, %o7, reg;
+# define LDPTR ldx
+# define SIZE_T_CC %xcc
+# define STACK_FRAME 192
+# define STACK_BIAS 2047
+# define STACK_7thARG (STACK_BIAS+176)
+
+# else
+
+# define SPARC_LOAD_ADDRESS(SYM, reg) \
+ set SYM, reg;
+# define LDPTR ld
+# define SIZE_T_CC %icc
+# define STACK_FRAME 112
+# define STACK_BIAS 0
+# define STACK_7thARG 92
+# define SPARC_LOAD_ADDRESS_LEAF(SYM,reg,tmp) SPARC_LOAD_ADDRESS(SYM,reg)
+
+# endif
+/* PIC: load the address via the register set up above; %o7 is overwritten. */
+# ifdef __PIC__
+# undef SPARC_LOAD_ADDRESS
+# undef SPARC_LOAD_ADDRESS_LEAF
+# define SPARC_LOAD_ADDRESS(SYM, reg) \
+ SPARC_SETUP_GOT_REG(reg); \
+ sethi %hi(SYM), %o7; \
+ or %o7, %lo(SYM), %o7; \
+ LDPTR [reg + %o7], reg;
+# endif
+/* Fallback leaf variant: keeps %o7 in tmp around SPARC_LOAD_ADDRESS. */
+# ifndef SPARC_LOAD_ADDRESS_LEAF
+# define SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp) \
+ mov %o7, tmp; \
+ SPARC_LOAD_ADDRESS(SYM, reg) \
+ mov tmp, %o7;
+# endif
+
+#endif /* __SPARC_ARCH_H__ */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/sparct4-mont.pl Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,1223 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by David S. Miller <[email protected]> and Andy Polyakov
+# <[email protected]>. The module is licensed under 2-clause BSD
+# license. November 2012. All rights reserved.
+# ====================================================================
+
+######################################################################
+# Montgomery squaring-n-multiplication module for SPARC T4.
+#
+# The module consists of three parts:
+#
+# 1) collection of "single-op" subroutines that perform single
+# operation, Montgomery squaring or multiplication, on 512-,
+# 1024-, 1536- and 2048-bit operands;
+# 2) collection of "multi-op" subroutines that perform 5 squaring and
+# 1 multiplication operations on operands of above lengths;
+# 3) fall-back and helper VIS3 subroutines.
+#
+# RSA sign is dominated by the multi-op subroutines, while RSA verify
+# and DSA are dominated by the single-op ones. A special note about the
+# 4096-bit RSA verify result: the operands are too long for the
+# dedicated hardware, so they are handled by the VIS3 code, which is
+# why you don't see any improvement. It's surely possible to improve it
+# [by deploying the 'mpmul' instruction], maybe in the future...
+#
+# Performance improvement.
+#
+# 64-bit process, VIS3:
+# sign verify sign/s verify/s
+# rsa 1024 bits 0.000628s 0.000028s 1592.4 35434.4
+# rsa 2048 bits 0.003282s 0.000106s 304.7 9438.3
+# rsa 4096 bits 0.025866s 0.000340s 38.7 2940.9
+# dsa 1024 bits 0.000301s 0.000332s 3323.7 3013.9
+# dsa 2048 bits 0.001056s 0.001233s 946.9 810.8
+#
+# 64-bit process, this module:
+# sign verify sign/s verify/s
+# rsa 1024 bits 0.000256s 0.000016s 3904.4 61411.9
+# rsa 2048 bits 0.000946s 0.000029s 1056.8 34292.7
+# rsa 4096 bits 0.005061s 0.000340s 197.6 2940.5
+# dsa 1024 bits 0.000176s 0.000195s 5674.7 5130.5
+# dsa 2048 bits 0.000296s 0.000354s 3383.2 2827.6
+#
+######################################################################
+# 32-bit process, VIS3:
+# sign verify sign/s verify/s
+# rsa 1024 bits 0.000665s 0.000028s 1504.8 35233.3
+# rsa 2048 bits 0.003349s 0.000106s 298.6 9433.4
+# rsa 4096 bits 0.025959s 0.000341s 38.5 2934.8
+# dsa 1024 bits 0.000320s 0.000341s 3123.3 2929.6
+# dsa 2048 bits 0.001101s 0.001260s 908.2 793.4
+#
+# 32-bit process, this module:
+# sign verify sign/s verify/s
+# rsa 1024 bits 0.000301s 0.000017s 3317.1 60240.0
+# rsa 2048 bits 0.001034s 0.000030s 966.9 33812.7
+# rsa 4096 bits 0.005244s 0.000341s 190.7 2935.4
+# dsa 1024 bits 0.000201s 0.000205s 4976.1 4879.2
+# dsa 2048 bits 0.000328s 0.000360s 3051.1 2774.2
+#
+# 32-bit code is prone to performance degradation as the rate of
+# interrupts dispatched to the CPU executing the code grows. This is
+# because the standard handling of an interrupt in 32-bit process
+# context zeroes the upper halves of most integer registers used as
+# input or output. This renders the result invalid, and the operation
+# has to be re-run. If the CPU is "bothered" by timer interrupts only,
+# the penalty is hardly measurable. But to mitigate this problem at
+# higher interrupt rates, contemporary Linux kernels recognize a biased
+# stack even in 32-bit process context and preserve full registers. See
+# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
+# for details.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path
+push(@INC,"${dir}","${dir}../../perlasm"); # search it and ../../perlasm for modules
+require "sparcv9_modes.pl";
+
+$code.=<<___;
+#include "sparc_arch.h"
+#include <openssl/fipssyms.h>
+
+#ifdef __arch64__
+.register %g2,#scratch
+.register %g3,#scratch
+#endif
+
+.section ".text",#alloc,#execinstr
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+___
+
+########################################################################
+# Register layout for mont[mul|sqr] instructions.
+# For details see "Oracle SPARC Architecture 2011" manual at
+# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
+#
+my @R=map("%f".2*$_,(0..11,30,31,12..29)); # 32 FP regs: %f0..%f22,%f60,%f62,%f24..%f58
+my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]); # 14 regs, repeated: 32 slots
+my @A=(@N[0..13],@R[14..31]); # first 14 slots are integer regs, the rest FP regs
+my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3))); # 14+14+4 = 32 slots
+
+########################################################################
+# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
+# const u64 *np,const BN_ULONG *n0);
+# Emits one routine per $NUM; the routine returns 1 on success, 0 on failure.
+sub generate_bn_mul_mont_t4() {
+my $NUM=shift; # operand length in 64-bit words
+my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5)); # %g1..%g5
+# Prologue: set up $sentinel, open a register window, copy the arguments.
+$code.=<<___;
+.globl bn_mul_mont_t4_$NUM
+.align 32
+bn_mul_mont_t4_$NUM:
+#ifdef __arch64__
+ mov 0,$sentinel
+ mov -128,%g4
+#elif defined(SPARCV9_64BIT_STACK)
+ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+ ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
+ mov -2047,%g4
+ and %g1,SPARCV9_64BIT_STACK,%g1
+ movrz %g1,0,%g4
+ mov -1,$sentinel
+ add %g4,-128,%g4
+#else
+ mov -1,$sentinel
+ mov -128,%g4
+#endif
+ sllx $sentinel,32,$sentinel
+ save %sp,%g4,%sp
+#ifndef __arch64__
+ save %sp,-128,%sp ! warm it up
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ restore
+ restore
+ restore
+ restore
+ restore
+ restore
+#endif
+ and %sp,1,%g4
+ or $sentinel,%fp,%fp
+ or %g4,$sentinel,$sentinel
+
+ ! copy arguments to global registers
+ mov %i0,$rp
+ mov %i1,$ap
+ mov %i2,$bp
+ mov %i3,$np
+ ld [%i4+0],%f1 ! load *n0
+ ld [%i4+4],%f0
+ fsrc2 %f0,%f60
+___
+
+# load ap[$NUM] ########################################################
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) { # words 0..13: integer registers @A[0..13]
+my $lo=$i<13?@A[$i+1]:"%o7"; # scratch register for the low 32 bits
+$code.=<<___;
+ ld [$ap+$i*8+0],$lo
+ ld [$ap+$i*8+4],@A[$i]
+ sllx @A[$i],32,@A[$i]
+ or $lo,@A[$i],@A[$i]
+___
+}
+for(; $i<$NUM; $i++) { # remaining words: FP registers @A[14..]
+my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1)); # staging pair, cycling through %f0..%f7
+$code.=<<___;
+ ld [$ap+$i*8+0],$lo
+ ld [$ap+$i*8+4],$hi
+ fsrc2 $hi,@A[$i]
+___
+}
+# load np[$NUM] ########################################################
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) { # np words 0..13
+my $lo=$i<13?@N[$i+1]:"%o7";
+$code.=<<___;
+ ld [$np+$i*8+0],$lo
+ ld [$np+$i*8+4],@N[$i]
+ sllx @N[$i],32,@N[$i]
+ or $lo,@N[$i],@N[$i]
+___
+}
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for(; $i<28 && $i<$NUM; $i++) { # np words 14..27, emitted after another save
+my $lo=$i<27?@N[$i+1]:"%o7";
+$code.=<<___;
+ ld [$np+$i*8+0],$lo
+ ld [$np+$i*8+4],@N[$i]
+ sllx @N[$i],32,@N[$i]
+ or $lo,@N[$i],@N[$i]
+___
+}
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for(; $i<$NUM; $i++) { # np words 28.., emitted after a further save
+my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
+$code.=<<___;
+ ld [$np+$i*8+0],$lo
+ ld [$np+$i*8+4],@N[$i]
+ sllx @N[$i],32,@N[$i]
+ or $lo,@N[$i],@N[$i]
+___
+}
+$code.=<<___;
+ cmp $ap,$bp
+ be SIZE_T_CC,.Lmsquare_$NUM
+ nop
+___
+
+# load bp[$NUM] ########################################################
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) { # bp words 0..13
+my $lo=$i<13?@B[$i+1]:"%o7";
+$code.=<<___;
+ ld [$bp+$i*8+0],$lo
+ ld [$bp+$i*8+4],@B[$i]
+ sllx @B[$i],32,@B[$i]
+ or $lo,@B[$i],@B[$i]
+___
+}
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for(; $i<$NUM; $i++) { # bp words 14.., emitted after another save
+my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
+$code.=<<___;
+ ld [$bp+$i*8+0],$lo
+ ld [$bp+$i*8+4],@B[$i]
+ sllx @B[$i],32,@B[$i]
+ or $lo,@B[$i],@B[$i]
+___
+}
+# magic ################################################################
+$code.=<<___;
+ .word 0x81b02920+$NUM-1 ! montmul $NUM-1
+.Lmresume_$NUM:
+ fbu,pn %fcc3,.Lmabort_$NUM
+#ifndef __arch64__
+ and %fp,$sentinel,$sentinel
+ brz,pn $sentinel,.Lmabort_$NUM
+#endif
+ nop
+#ifdef __arch64__
+ restore
+ restore
+ restore
+ restore
+ restore
+#else
+ restore; and %fp,$sentinel,$sentinel
+ restore; and %fp,$sentinel,$sentinel
+ restore; and %fp,$sentinel,$sentinel
+ restore; and %fp,$sentinel,$sentinel
+ brz,pn $sentinel,.Lmabort1_$NUM
+ restore
+#endif
+___
+
+# save tp[$NUM] ########################################################
+for($i=0; $i<14 && $i<$NUM; $i++) { # result words 0..13: integer regs -> FP regs @R[0..13]
+$code.=<<___;
+ movxtod @A[$i],@R[$i]
+___
+}
+$code.=<<___;
+#ifdef __arch64__
+ restore
+#else
+ and %fp,$sentinel,$sentinel
+ restore
+ and $sentinel,1,%o7
+ and %fp,$sentinel,$sentinel
+ srl %fp,0,%fp ! just in case?
+ or %o7,$sentinel,$sentinel
+ brz,a,pn $sentinel,.Lmdone_$NUM
+ mov 0,%i0 ! return failure
+#endif
+___
+for($i=0; $i<12 && $i<$NUM; $i++) { # @R[0..11]: store the two 32-bit halves directly
+@R[$i] =~ /%f([0-9]+)/; # extract the register number
+my $lo = "%f".($1+1); # the odd register following @R[$i]
+$code.=<<___;
+ st $lo,[$rp+$i*8+0]
+ st @R[$i],[$rp+$i*8+4]
+___
+}
+for(; $i<$NUM; $i++) { # the rest are copied to a %f0..%f7 staging pair first
+my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
+$code.=<<___;
+ fsrc2 @R[$i],$hi
+ st $lo,[$rp+$i*8+0]
+ st $hi,[$rp+$i*8+4]
+___
+}
+$code.=<<___;
+ mov 1,%i0 ! return success
+.Lmdone_$NUM:
+ ret
+ restore
+
+.Lmabort_$NUM:
+ restore
+ restore
+ restore
+ restore
+ restore
+.Lmabort1_$NUM:
+ restore
+
+ mov 0,%i0 ! return failure
+ ret
+ restore
+
+.align 32
+.Lmsquare_$NUM:
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+ .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
+ ba .Lmresume_$NUM
+ nop
+.type bn_mul_mont_t4_$NUM, #function
+.size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
+___
+}
+
+for ($i=8;$i<=32;$i+=8) { # $NUM = 8, 16, 24 and 32 words (512..2048 bits)
+ &generate_bn_mul_mont_t4($i);
+}
+
+########################################################################
+# load_ccr: $ptbl += 8*($pwr&3); $ccr = 1<<(($pwr>>2)&7); writes %ccr unless $skip_wr.
+sub load_ccr {
+my ($ptbl,$pwr,$ccr,$skip_wr)=@_; # register names; emitted code uses %o4/%o5 as scratch
+$code.=<<___;
+ srl $pwr, 2, %o4
+ and $pwr, 3, %o5
+ and %o4, 7, %o4
+ sll %o5, 3, %o5 ! offset within first cache line
+ add %o5, $ptbl, $ptbl ! of the pwrtbl
+ or %g0, 1, %o5
+ sll %o5, %o4, $ccr
+___
+$code.=<<___ if (!$skip_wr);
+ wr $ccr, %g0, %ccr
+___
+}
+sub load_b_pair {
+my ($pwrtbl,$B0,$B1)=@_; # %o4/%o5 are scratch; $pwrtbl is advanced by 16*32
+# For each of $B0/$B1: load 8 candidates 32 bytes apart, pick one by %ccr flags.
+$code.=<<___;
+ ldx [$pwrtbl+0*32], $B0
+ ldx [$pwrtbl+8*32], $B1
+ ldx [$pwrtbl+1*32], %o4
+ ldx [$pwrtbl+9*32], %o5
+ movvs %icc, %o4, $B0
+ ldx [$pwrtbl+2*32], %o4
+ movvs %icc, %o5, $B1
+ ldx [$pwrtbl+10*32],%o5
+ move %icc, %o4, $B0
+ ldx [$pwrtbl+3*32], %o4
+ move %icc, %o5, $B1
+ ldx [$pwrtbl+11*32],%o5
+ movneg %icc, %o4, $B0
+ ldx [$pwrtbl+4*32], %o4
+ movneg %icc, %o5, $B1
+ ldx [$pwrtbl+12*32],%o5
+ movcs %xcc, %o4, $B0
+ ldx [$pwrtbl+5*32],%o4
+ movcs %xcc, %o5, $B1
+ ldx [$pwrtbl+13*32],%o5
+ movvs %xcc, %o4, $B0
+ ldx [$pwrtbl+6*32], %o4
+ movvs %xcc, %o5, $B1
+ ldx [$pwrtbl+14*32],%o5
+ move %xcc, %o4, $B0
+ ldx [$pwrtbl+7*32], %o4
+ move %xcc, %o5, $B1
+ ldx [$pwrtbl+15*32],%o5
+ movneg %xcc, %o4, $B0
+ add $pwrtbl,16*32, $pwrtbl
+ movneg %xcc, %o5, $B1
+___
+}
+sub load_b {
+my ($pwrtbl,$Bi)=@_; # only two arguments are read; %o4/%o5 are scratch
+# Load 8 candidates 32 bytes apart, pick one by %ccr flags, advance $pwrtbl by 8*32.
+$code.=<<___;
+ ldx [$pwrtbl+0*32], $Bi
+ ldx [$pwrtbl+1*32], %o4
+ ldx [$pwrtbl+2*32], %o5
+ movvs %icc, %o4, $Bi
+ ldx [$pwrtbl+3*32], %o4
+ move %icc, %o5, $Bi
+ ldx [$pwrtbl+4*32], %o5
+ movneg %icc, %o4, $Bi
+ ldx [$pwrtbl+5*32], %o4
+ movcs %xcc, %o5, $Bi
+ ldx [$pwrtbl+6*32], %o5
+ movvs %xcc, %o4, $Bi
+ ldx [$pwrtbl+7*32], %o4
+ move %xcc, %o5, $Bi
+ add $pwrtbl,8*32, $pwrtbl
+ movneg %xcc, %o4, $Bi
+___
+}
+
+########################################################################
+# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
+# const u64 *pwrtbl,int pwr,int stride);
+# Emits one routine per $NUM; each stride is 5 montsqr + 1 montmul; returns 1 or 0.
+sub generate_bn_pwr5_mont_t4() {
+my $NUM=shift; # operand length in 64-bit words
+my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5)); # %g1..%g5
+
+$code.=<<___;
+.globl bn_pwr5_mont_t4_$NUM
+.align 32
+bn_pwr5_mont_t4_$NUM:
+#ifdef __arch64__
+ mov 0,$sentinel
+ mov -128,%g4
+#elif defined(SPARCV9_64BIT_STACK)
+ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+ ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
+ mov -2047,%g4
+ and %g1,SPARCV9_64BIT_STACK,%g1
+ movrz %g1,0,%g4
+ mov -1,$sentinel
+ add %g4,-128,%g4
+#else
+ mov -1,$sentinel
+ mov -128,%g4
+#endif
+ sllx $sentinel,32,$sentinel
+ save %sp,%g4,%sp
+#ifndef __arch64__
+ save %sp,-128,%sp ! warm it up
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ save %sp,-128,%sp
+ restore
+ restore
+ restore
+ restore
+ restore
+ restore
+#endif
+ and %sp,1,%g4
+ or $sentinel,%fp,%fp
+ or %g4,$sentinel,$sentinel
+
+ ! copy arguments to global registers
+ mov %i0,$tp
+ mov %i1,$np
+ ld [%i2+0],%f1 ! load *n0
+ ld [%i2+4],%f0
+ mov %i3,$pwrtbl
+ srl %i4,%g0,%i4 ! pack last arguments
+ sllx %i5,32,$pwr
+ or %i4,$pwr,$pwr
+ fsrc2 %f0,%f60
+___
+
+# load tp[$NUM] ########################################################
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) { # tp words 0..13: integer registers @A[0..13]
+$code.=<<___;
+ ldx [$tp+$i*8],@A[$i]
+___
+}
+for(; $i<$NUM; $i++) { # remaining tp words: FP registers @A[14..]
+$code.=<<___;
+ ldd [$tp+$i*8],@A[$i]
+___
+}
+# load np[$NUM] ########################################################
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) { # np words 0..13
+$code.=<<___;
+ ldx [$np+$i*8],@N[$i]
+___
+}
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for(; $i<28 && $i<$NUM; $i++) { # np words 14..27, emitted after another save
+$code.=<<___;
+ ldx [$np+$i*8],@N[$i]
+___
+}
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for(; $i<$NUM; $i++) { # np words 28.., emitted after a further save
+$code.=<<___;
+ ldx [$np+$i*8],@N[$i]
+___
+}
+# load pwrtbl[pwr] ########################################################
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+
+ srlx $pwr, 32, %o4 ! unpack $pwr
+ srl $pwr, %g0, %o5
+ sub %o4, 5, %o4
+ mov $pwrtbl, %o7
+ sllx %o4, 32, $pwr ! re-pack $pwr
+ or %o5, $pwr, $pwr
+ srl %o5, %o4, %o5
+___
+ &load_ccr("%o7","%o5","%o4"); # this call also emits the write to %ccr
+$code.=<<___;
+ b .Lstride_$NUM
+ nop
+.align 16
+.Lstride_$NUM:
+___
+for($i=0; $i<14 && $i<$NUM; $i+=2) { # two words of b per call, table pointer in %o7
+ &load_b_pair("%o7",@B[$i],@B[$i+1]);
+}
+$code.=<<___;
+ save %sp,-128,%sp; or $sentinel,%fp,%fp
+___
+for(; $i<$NUM; $i+=2) { # after the save above the table pointer is addressed as %i7
+ &load_b_pair("%i7",@B[$i],@B[$i+1]);
+}
+$code.=<<___;
+ srax $pwr, 32, %o4 ! unpack $pwr
+ srl $pwr, %g0, %o5
+ sub %o4, 5, %o4
+ mov $pwrtbl, %i7
+ sllx %o4, 32, $pwr ! re-pack $pwr
+ or %o5, $pwr, $pwr
+ srl %o5, %o4, %o5
+___
+ &load_ccr("%i7","%o5","%o4",1); # mask only; %ccr itself is written by the wr below
+
+# magic ################################################################
+for($i=0; $i<5; $i++) { # five squarings, each followed by the abort checks
+$code.=<<___;
+ .word 0x81b02940+$NUM-1 ! montsqr $NUM-1
+ fbu,pn %fcc3,.Labort_$NUM
+#ifndef __arch64__
+ and %fp,$sentinel,$sentinel
+ brz,pn $sentinel,.Labort_$NUM
+#endif
+ nop
+___
+}
+$code.=<<___;
+ wr %o4, %g0, %ccr
+ .word 0x81b02920+$NUM-1 ! montmul $NUM-1
+ fbu,pn %fcc3,.Labort_$NUM
+#ifndef __arch64__
+ and %fp,$sentinel,$sentinel
+ brz,pn $sentinel,.Labort_$NUM
+#endif
+
+ srax $pwr, 32, %o4
+#ifdef __arch64__
+ brgez %o4,.Lstride_$NUM
+ restore
+ restore
+ restore
+ restore
+ restore
+#else
+ brgez %o4,.Lstride_$NUM
+ restore; and %fp,$sentinel,$sentinel
+ restore; and %fp,$sentinel,$sentinel
+ restore; and %fp,$sentinel,$sentinel
+ restore; and %fp,$sentinel,$sentinel
+ brz,pn $sentinel,.Labort1_$NUM
+ restore
+#endif
+___
+
+# save tp[$NUM] ########################################################
+for($i=0; $i<14 && $i<$NUM; $i++) { # result words 0..13: integer regs -> FP regs @R[0..13]
+$code.=<<___;
+ movxtod @A[$i],@R[$i]
+___
+}
+$code.=<<___;
+#ifdef __arch64__
+ restore
+#else
+ and %fp,$sentinel,$sentinel
+ restore
+ and $sentinel,1,%o7
+ and %fp,$sentinel,$sentinel
+ srl %fp,0,%fp ! just in case?
+ or %o7,$sentinel,$sentinel
+ brz,a,pn $sentinel,.Ldone_$NUM
+ mov 0,%i0 ! return failure
+#endif
+___
+for($i=0; $i<$NUM; $i++) { # store every result word from its FP register
+$code.=<<___;
+ std @R[$i],[$tp+$i*8]
+___
+}
+$code.=<<___;
+ mov 1,%i0 ! return success
+.Ldone_$NUM:
+ ret
+ restore
+
+.Labort_$NUM:
+ restore
+ restore
+ restore
+ restore
+ restore
+.Labort1_$NUM:
+ restore
+
+ mov 0,%i0 ! return failure
+ ret
+ restore
+.type bn_pwr5_mont_t4_$NUM, #function
+.size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
+___
+}
+
+for ($i=8;$i<=32;$i+=8) { # $NUM = 8, 16, 24 and 32 words (512..2048 bits)
+ &generate_bn_pwr5_mont_t4($i);
+}
+
+{
+########################################################################
+# Fall-back subroutines
+#
+# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
+#
+($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
+ (map("%g$_",(1..5)),map("%o$_",(0..5,7))); # %g1-%g5, %o0-%o5, %o7
+
+# int bn_mul_mont_t4(
+$rp="%o0"; # u64 *rp,
+$ap="%o1"; # const u64 *ap,
+$bp="%o2"; # const u64 *bp,
+$np="%o3"; # const u64 *np,
+$n0p="%o4"; # const BN_ULONG *n0,
+$num="%o5"; # int num); # caller ensures that num is >=3
+$code.=<<___;
+.globl bn_mul_mont_t4
+.align 32
+bn_mul_mont_t4:
+ add %sp, STACK_BIAS, %g4 ! real top of stack
+ sll $num, 3, $num ! size in bytes
+ add $num, 63, %g1
+ andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
+ sub %g4, %g1, %g1
+ andn %g1, 63, %g1 ! align at 64 byte
+ sub %g1, STACK_FRAME, %g1 ! new top of stack
+ sub %g1, %g4, %g1
+
+ save %sp, %g1, %sp
+___
+# +-------------------------------+<----- %sp
+# . .
+# +-------------------------------+<----- aligned at 64 bytes
+# | __int64 tmp[0] |
+# +-------------------------------+
+# . .
+# . .
+# +-------------------------------+<----- aligned at 64 bytes
+# . .
+($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5)); # arguments are addressed as %i0-%i5 after the save
+($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7)); # %l0..%l6 (%l7 is dropped)
+($ovf,$i)=($t0,$t1); # $ovf/$i share %l0/%l1 with $t0/$t1
+$code.=<<___;
+ ld [$n0p+0], $t0 ! pull n0[0..1] value
+ ld [$n0p+4], $t1
+ add %sp, STACK_BIAS+STACK_FRAME, $tp
+ ldx [$bp+0], $m0 ! m0=bp[0]
+ sllx $t1, 32, $n0
+ add $bp, 8, $bp
+ or $t0, $n0, $n0
+
+ ldx [$ap+0], $aj ! ap[0]
+
+ mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
+ umulxhi $aj, $m0, $hi0
+
+ ldx [$ap+8], $aj ! ap[1]
+ add $ap, 16, $ap
+ ldx [$np+0], $nj ! np[0]
+
+ mulx $lo0, $n0, $m1 ! "tp[0]"*n0
+
+ mulx $aj, $m0, $alo ! ap[1]*bp[0]
+ umulxhi $aj, $m0, $aj ! ahi=aj
+
+ mulx $nj, $m1, $lo1 ! np[0]*m1
+ umulxhi $nj, $m1, $hi1
+
+ ldx [$np+8], $nj ! np[1]
+
+ addcc $lo0, $lo1, $lo1
+ add $np, 16, $np
+ addxc %g0, $hi1, $hi1
+
+ mulx $nj, $m1, $nlo ! np[1]*m1
+ umulxhi $nj, $m1, $nj ! nhi=nj
+
+ ba .L1st
+ sub $num, 24, $cnt ! cnt=num-3
+
+.align 16
+.L1st:
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0
+
+ ldx [$ap+0], $aj ! ap[j]
+ addcc $nlo, $hi1, $lo1
+ add $ap, 8, $ap
+ addxc $nj, %g0, $hi1 ! nhi=nj
+
+ ldx [$np+0], $nj ! np[j]
+ mulx $aj, $m0, $alo ! ap[j]*bp[0]
+ add $np, 8, $np
+ umulxhi $aj, $m0, $aj ! ahi=aj
+
+ mulx $nj, $m1, $nlo ! np[j]*m1
+ addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
+ umulxhi $nj, $m1, $nj ! nhi=nj
+ addxc %g0, $hi1, $hi1
+ stxa $lo1, [$tp]0xe2 ! tp[j-1]
+ add $tp, 8, $tp ! tp++
+
+ brnz,pt $cnt, .L1st
+ sub $cnt, 8, $cnt ! j--
+!.L1st
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0 ! ahi=aj
+
+ addcc $nlo, $hi1, $lo1
+ addxc $nj, %g0, $hi1
+ addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
+ addxc %g0, $hi1, $hi1
+ stxa $lo1, [$tp]0xe2 ! tp[j-1]
+ add $tp, 8, $tp
+
+ addcc $hi0, $hi1, $hi1
+ addxc %g0, %g0, $ovf ! upmost overflow bit
+ stxa $hi1, [$tp]0xe2
+ add $tp, 8, $tp
+
+ ba .Louter
+ sub $num, 16, $i ! i=num-2
+
+.align 16
+.Louter:
+ ldx [$bp+0], $m0 ! m0=bp[i]
+ add $bp, 8, $bp
+
+ sub $ap, $num, $ap ! rewind
+ sub $np, $num, $np
+ sub $tp, $num, $tp
+
+ ldx [$ap+0], $aj ! ap[0]
+ ldx [$np+0], $nj ! np[0]
+
+ mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
+ ldx [$tp], $tj ! tp[0]
+ umulxhi $aj, $m0, $hi0
+ ldx [$ap+8], $aj ! ap[1]
+ addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
+ mulx $aj, $m0, $alo ! ap[1]*bp[i]
+ addxc %g0, $hi0, $hi0
+ mulx $lo0, $n0, $m1 ! tp[0]*n0
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ mulx $nj, $m1, $lo1 ! np[0]*m1
+ add $ap, 16, $ap
+ umulxhi $nj, $m1, $hi1
+ ldx [$np+8], $nj ! np[1]
+ add $np, 16, $np
+ addcc $lo1, $lo0, $lo1
+ mulx $nj, $m1, $nlo ! np[1]*m1
+ addxc %g0, $hi1, $hi1
+ umulxhi $nj, $m1, $nj ! nhi=nj
+
+ ba .Linner
+ sub $num, 24, $cnt ! cnt=num-3
+.align 16
+.Linner:
+ addcc $alo, $hi0, $lo0
+ ldx [$tp+8], $tj ! tp[j]
+ addxc $aj, %g0, $hi0 ! ahi=aj
+ ldx [$ap+0], $aj ! ap[j]
+ add $ap, 8, $ap
+ addcc $nlo, $hi1, $lo1
+ mulx $aj, $m0, $alo ! ap[j]*bp[i]
+ addxc $nj, %g0, $hi1 ! nhi=nj
+ ldx [$np+0], $nj ! np[j]
+ add $np, 8, $np
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
+ mulx $nj, $m1, $nlo ! np[j]*m1
+ addxc %g0, $hi0, $hi0
+ umulxhi $nj, $m1, $nj ! nhi=nj
+ addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+ add $tp, 8, $tp
+ brnz,pt $cnt, .Linner
+ sub $cnt, 8, $cnt
+!.Linner
+ ldx [$tp+8], $tj ! tp[j]
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0 ! ahi=aj
+ addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi0, $hi0
+
+ addcc $nlo, $hi1, $lo1
+ addxc $nj, %g0, $hi1 ! nhi=nj
+ addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+
+ subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
+ addxccc $hi1, $hi0, $hi1
+ addxc %g0, %g0, $ovf
+ stx $hi1, [$tp+8]
+ add $tp, 16, $tp
+
+ brnz,pt $i, .Louter
+ sub $i, 8, $i
+
+ sub $ap, $num, $ap ! rewind
+ sub $np, $num, $np
+ sub $tp, $num, $tp
+ ba .Lsub
+ subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
+
+.align 16
+.Lsub:
+ ldx [$tp], $tj
+ add $tp, 8, $tp
+ ldx [$np+0], $nj
+ add $np, 8, $np
+ subccc $tj, $nj, $t2 ! tp[j]-np[j]
+ srlx $tj, 32, $tj
+ srlx $nj, 32, $nj
+ subccc $tj, $nj, $t3
+ add $rp, 8, $rp
+ st $t2, [$rp-4] ! reverse order
+ st $t3, [$rp-8]
+ brnz,pt $cnt, .Lsub
+ sub $cnt, 8, $cnt
+
+ sub $np, $num, $np ! rewind
+ sub $tp, $num, $tp
+ sub $rp, $num, $rp
+
+ subc $ovf, %g0, $ovf ! handle upmost overflow bit
+ and $tp, $ovf, $ap
+ andn $rp, $ovf, $np
+ or $np, $ap, $ap ! ap=borrow?tp:rp
+ ba .Lcopy
+ sub $num, 8, $cnt
+
+.align 16
+.Lcopy: ! copy or in-place refresh
+ ldx [$ap+0], $t2
+ add $ap, 8, $ap
+ stx %g0, [$tp] ! zap
+ add $tp, 8, $tp
+ stx $t2, [$rp+0]
+ add $rp, 8, $rp
+ brnz $cnt, .Lcopy
+ sub $cnt, 8, $cnt
+
+ mov 1, %o0
+ ret
+ restore
+.type bn_mul_mont_t4, #function
+.size bn_mul_mont_t4, .-bn_mul_mont_t4
+___
+
+# int bn_mul_mont_gather5_t4(
+$rp="%o0"; # u64 *rp,
+$ap="%o1"; # const u64 *ap,
+$bp="%o2"; # const u64 *pwrtbl,
+$np="%o3"; # const u64 *np,
+$n0p="%o4"; # const BN_ULONG *n0,
+$num="%o5"; # int num, # caller ensures that num is >=3
+ # int power); # 7th argument, loaded from the stack into %g4
+$code.=<<___;
+.globl bn_mul_mont_gather5_t4
+.align 32
+bn_mul_mont_gather5_t4:
+ add %sp, STACK_BIAS, %g4 ! real top of stack
+ sll $num, 3, $num ! size in bytes
+ add $num, 63, %g1
+ andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
+ sub %g4, %g1, %g1
+ andn %g1, 63, %g1 ! align at 64 byte
+ sub %g1, STACK_FRAME, %g1 ! new top of stack
+ sub %g1, %g4, %g1
+ LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument
+
+ save %sp, %g1, %sp
+___
+# +-------------------------------+<----- %sp
+# . .
+# +-------------------------------+<----- aligned at 64 bytes
+# | __int64 tmp[0] |
+# +-------------------------------+
+# . .
+# . .
+# +-------------------------------+<----- aligned at 64 bytes
+# . .
+($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5)); # arguments are addressed as %i0-%i5 after the save
+($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7)); # %l0..%l7; $ccr is re-written to %ccr per outer pass
+($ovf,$i)=($t0,$t1); # $ovf/$i share %l0/%l1 with $t0/$t1
+ &load_ccr($bp,"%g4",$ccr); # %g4 holds power; the mask is kept in $ccr
+ &load_b($bp,$m0,"%o7"); # m0=bp[0]; load_b reads two arguments, "%o7" is ignored
+
+$code.=<<___;
+ ld [$n0p+0], $t0 ! pull n0[0..1] value
+ ld [$n0p+4], $t1
+ add %sp, STACK_BIAS+STACK_FRAME, $tp
+ sllx $t1, 32, $n0
+ or $t0, $n0, $n0
+
+ ldx [$ap+0], $aj ! ap[0]
+
+ mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
+ umulxhi $aj, $m0, $hi0
+
+ ldx [$ap+8], $aj ! ap[1]
+ add $ap, 16, $ap
+ ldx [$np+0], $nj ! np[0]
+
+ mulx $lo0, $n0, $m1 ! "tp[0]"*n0
+
+ mulx $aj, $m0, $alo ! ap[1]*bp[0]
+ umulxhi $aj, $m0, $aj ! ahi=aj
+
+ mulx $nj, $m1, $lo1 ! np[0]*m1
+ umulxhi $nj, $m1, $hi1
+
+ ldx [$np+8], $nj ! np[1]
+
+ addcc $lo0, $lo1, $lo1
+ add $np, 16, $np
+ addxc %g0, $hi1, $hi1
+
+ mulx $nj, $m1, $nlo ! np[1]*m1
+ umulxhi $nj, $m1, $nj ! nhi=nj
+
+ ba .L1st_g5
+ sub $num, 24, $cnt ! cnt=num-3
+
+.align 16
+.L1st_g5:
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0
+
+ ldx [$ap+0], $aj ! ap[j]
+ addcc $nlo, $hi1, $lo1
+ add $ap, 8, $ap
+ addxc $nj, %g0, $hi1 ! nhi=nj
+
+ ldx [$np+0], $nj ! np[j]
+ mulx $aj, $m0, $alo ! ap[j]*bp[0]
+ add $np, 8, $np
+ umulxhi $aj, $m0, $aj ! ahi=aj
+
+ mulx $nj, $m1, $nlo ! np[j]*m1
+ addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
+ umulxhi $nj, $m1, $nj ! nhi=nj
+ addxc %g0, $hi1, $hi1
+ stxa $lo1, [$tp]0xe2 ! tp[j-1]
+ add $tp, 8, $tp ! tp++
+
+ brnz,pt $cnt, .L1st_g5
+ sub $cnt, 8, $cnt ! j--
+!.L1st_g5
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0 ! ahi=aj
+
+ addcc $nlo, $hi1, $lo1
+ addxc $nj, %g0, $hi1
+ addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
+ addxc %g0, $hi1, $hi1
+ stxa $lo1, [$tp]0xe2 ! tp[j-1]
+ add $tp, 8, $tp
+
+ addcc $hi0, $hi1, $hi1
+ addxc %g0, %g0, $ovf ! upmost overflow bit
+ stxa $hi1, [$tp]0xe2
+ add $tp, 8, $tp
+
+ ba .Louter_g5
+ sub $num, 16, $i ! i=num-2
+
+.align 16
+.Louter_g5:
+ wr $ccr, %g0, %ccr
+___
+ &load_b($bp,$m0); # m0=bp[i]
+$code.=<<___;
+ sub $ap, $num, $ap ! rewind
+ sub $np, $num, $np
+ sub $tp, $num, $tp
+
+ ldx [$ap+0], $aj ! ap[0]
+ ldx [$np+0], $nj ! np[0]
+
+ mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
+ ldx [$tp], $tj ! tp[0]
+ umulxhi $aj, $m0, $hi0
+ ldx [$ap+8], $aj ! ap[1]
+ addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
+ mulx $aj, $m0, $alo ! ap[1]*bp[i]
+ addxc %g0, $hi0, $hi0
+ mulx $lo0, $n0, $m1 ! tp[0]*n0
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ mulx $nj, $m1, $lo1 ! np[0]*m1
+ add $ap, 16, $ap
+ umulxhi $nj, $m1, $hi1
+ ldx [$np+8], $nj ! np[1]
+ add $np, 16, $np
+ addcc $lo1, $lo0, $lo1
+ mulx $nj, $m1, $nlo ! np[1]*m1
+ addxc %g0, $hi1, $hi1
+ umulxhi $nj, $m1, $nj ! nhi=nj
+
+ ba .Linner_g5
+ sub $num, 24, $cnt ! cnt=num-3
+.align 16
+.Linner_g5:
+ addcc $alo, $hi0, $lo0
+ ldx [$tp+8], $tj ! tp[j]
+ addxc $aj, %g0, $hi0 ! ahi=aj
+ ldx [$ap+0], $aj ! ap[j]
+ add $ap, 8, $ap
+ addcc $nlo, $hi1, $lo1
+ mulx $aj, $m0, $alo ! ap[j]*bp[i]
+ addxc $nj, %g0, $hi1 ! nhi=nj
+ ldx [$np+0], $nj ! np[j]
+ add $np, 8, $np
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
+ mulx $nj, $m1, $nlo ! np[j]*m1
+ addxc %g0, $hi0, $hi0
+ umulxhi $nj, $m1, $nj ! nhi=nj
+ addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+ add $tp, 8, $tp
+ brnz,pt $cnt, .Linner_g5
+ sub $cnt, 8, $cnt
+!.Linner_g5
+ ldx [$tp+8], $tj ! tp[j]
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0 ! ahi=aj
+ addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi0, $hi0
+
+ addcc $nlo, $hi1, $lo1
+ addxc $nj, %g0, $hi1 ! nhi=nj
+ addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+
+ subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
+ addxccc $hi1, $hi0, $hi1
+ addxc %g0, %g0, $ovf
+ stx $hi1, [$tp+8]
+ add $tp, 16, $tp
+
+ brnz,pt $i, .Louter_g5
+ sub $i, 8, $i
+
+ sub $ap, $num, $ap ! rewind
+ sub $np, $num, $np
+ sub $tp, $num, $tp
+ ba .Lsub_g5
+ subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
+
+.align 16
+.Lsub_g5:
+ ldx [$tp], $tj
+ add $tp, 8, $tp
+ ldx [$np+0], $nj
+ add $np, 8, $np
+ subccc $tj, $nj, $t2 ! tp[j]-np[j]
+ srlx $tj, 32, $tj
+ srlx $nj, 32, $nj
+ subccc $tj, $nj, $t3
+ add $rp, 8, $rp
+ st $t2, [$rp-4] ! reverse order
+ st $t3, [$rp-8]
+ brnz,pt $cnt, .Lsub_g5
+ sub $cnt, 8, $cnt
+
+ sub $np, $num, $np ! rewind
+ sub $tp, $num, $tp
+ sub $rp, $num, $rp
+
+ subc $ovf, %g0, $ovf ! handle upmost overflow bit
+ and $tp, $ovf, $ap
+ andn $rp, $ovf, $np
+ or $np, $ap, $ap ! ap=borrow?tp:rp
+ ba .Lcopy_g5
+ sub $num, 8, $cnt
+
+.align 16
+.Lcopy_g5: ! copy or in-place refresh
+ ldx [$ap+0], $t2
+ add $ap, 8, $ap
+ stx %g0, [$tp] ! zap
+ add $tp, 8, $tp
+ stx $t2, [$rp+0]
+ add $rp, 8, $rp
+ brnz $cnt, .Lcopy_g5
+ sub $cnt, 8, $cnt
+
+ mov 1, %o0
+ ret
+ restore
+.type bn_mul_mont_gather5_t4, #function
+.size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
+___
+}
+
+$code.=<<___;
+.globl bn_flip_t4
+.align 32
+bn_flip_t4:
+.Loop_flip:
+ ld [%o1+0], %o4
+ sub %o2, 1, %o2
+ ld [%o1+4], %o5
+ add %o1, 8, %o1
+ st %o5, [%o0+0]
+ st %o4, [%o0+4]
+ brnz %o2, .Loop_flip
+ add %o0, 8, %o0
+ retl
+ nop
+.type bn_flip_t4, #function
+.size bn_flip_t4, .-bn_flip_t4
+
+.globl bn_flip_n_scatter5_t4
+.align 32
+bn_flip_n_scatter5_t4:
+ sll %o3, 3, %o3
+ srl %o1, 1, %o1
+ add %o3, %o2, %o2 ! &pwrtbl[pwr]
+ sub %o1, 1, %o1
+.Loop_flip_n_scatter5:
+ ld [%o0+0], %o4 ! inp[i]
+ ld [%o0+4], %o5
+ add %o0, 8, %o0
+ sllx %o5, 32, %o5
+ or %o4, %o5, %o5
+ stx %o5, [%o2]
+ add %o2, 32*8, %o2
+ brnz %o1, .Loop_flip_n_scatter5
+ sub %o1, 1, %o1
+ retl
+ nop
+.type bn_flip_n_scatter5_t4, #function
+.size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
+
+.globl bn_gather5_t4
+.align 32
+bn_gather5_t4:
+___
+ &load_ccr("%o2","%o3","%g1"); # offsets %o2 by the low bits of %o3, writes %ccr once
+$code.=<<___;
+ sub %o1, 1, %o1
+.Loop_gather5:
+___
+ &load_b("%o2","%g1"); # gathered word lands in %g1; %o2 advances by 8*32
+$code.=<<___;
+ stx %g1, [%o0]
+ add %o0, 8, %o0
+ brnz %o1, .Loop_gather5
+ sub %o1, 1, %o1
+
+ retl
+ nop
+.type bn_gather5_t4, #function
+.size bn_gather5_t4, .-bn_gather5_t4
+
+.asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
+.align 4
+___
+
+&emit_assembler(); # not defined in this file; presumably from sparcv9_modes.pl -- confirm
+
+close STDOUT; # last statement of the script
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/sparcv9-gf2m.pl Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,191 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# October 2012
+#
+# The module implements the bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. For the time being it's a fairly mechanical,
+# low-hanging port from C, except that it has two code paths: one
+# suitable for all SPARCv9 processors and one for VIS3-capable ones.
+# The former delivers ~25-45% more performance (more for longer keys)
+# on the heaviest DH and DSA verify operations on the venerable
+# UltraSPARC II. On T4 the VIS3 code is ~100-230% faster than
+# gcc-generated code and ~35-90% faster than the pure SPARCv9 path.
+
+$locals=16*8; # stack bytes for the 16-entry table of 64-bit values
+
+$tab="%l0"; # base address of that table
+
+@T=("%g2","%g3"); # temporaries; the two roles rotate in the unrolled loop
+@i=("%g4","%g5"); # table index/value registers; roles rotate likewise
+
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%o$_",(0..5)); # %o0..%o5
+($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo; # $hi shares %o3 with $a8, $a shares %g1 with $lo
+
+$code.=<<___;
+#include <sparc_arch.h>
+#include <openssl/fipssyms.h>
+
+#ifdef __arch64__
+.register %g2,#scratch
+.register %g3,#scratch
+#endif
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+
+.globl bn_GF2m_mul_2x2
+.align 16
+bn_GF2m_mul_2x2:
+ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+ ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0]
+
+ andcc %g1, SPARCV9_VIS3, %g0
+ bz,pn %icc,.Lsoftware
+ nop
+
+ sllx %o1, 32, %o1
+ sllx %o3, 32, %o3
+ or %o2, %o1, %o1
+ or %o4, %o3, %o3
+ .word 0x95b262ab ! xmulx %o1, %o3, %o2
+ .word 0x99b262cb ! xmulxhi %o1, %o3, %o4
+ srlx %o2, 32, %o1 ! 13 cycles later
+ st %o2, [%o0+0]
+ st %o1, [%o0+4]
+ srlx %o4, 32, %o3
+ st %o4, [%o0+8]
+ retl
+ st %o3, [%o0+12]
+
+.align 16
+.Lsoftware:
+ save %sp,-STACK_FRAME-$locals,%sp
+
+ sllx %i1,32,$a
+ mov -1,$a12
+ sllx %i3,32,$b
+ or %i2,$a,$a
+ srlx $a12,1,$a48 ! 0x7fff...
+ or %i4,$b,$b
+ srlx $a12,2,$a12 ! 0x3fff...
+ add %sp,STACK_BIAS+STACK_FRAME,$tab
+
+ sllx $a,2,$a4
+ mov $a,$a1
+ sllx $a,1,$a2
+
+ srax $a4,63,@i[1] ! broadcast 61st bit
+ and $a48,$a4,$a4 ! (a<<2)&0x7fff...
+ srlx $a48,2,$a48
+ srax $a2,63,@i[0] ! broadcast 62nd bit
+ and $a12,$a2,$a2 ! (a<<1)&0x3fff...
+ srax $a1,63,$lo ! broadcast 63rd bit
+ and $a48,$a1,$a1 ! (a<<0)&0x1fff...
+
+ sllx $a1,3,$a8
+ and $b,$lo,$lo
+ and $b,@i[0],@i[0]
+ and $b,@i[1],@i[1]
+
+ stx %g0,[$tab+0*8] ! tab[0]=0
+ xor $a1,$a2,$a12
+ stx $a1,[$tab+1*8] ! tab[1]=a1
+ stx $a2,[$tab+2*8] ! tab[2]=a2
+ xor $a4,$a8,$a48
+ stx $a12,[$tab+3*8] ! tab[3]=a1^a2
+ xor $a4,$a1,$a1
+
+ stx $a4,[$tab+4*8] ! tab[4]=a4
+ xor $a4,$a2,$a2
+ stx $a1,[$tab+5*8] ! tab[5]=a1^a4
+ xor $a4,$a12,$a12
+ stx $a2,[$tab+6*8] ! tab[6]=a2^a4
+ xor $a48,$a1,$a1
+ stx $a12,[$tab+7*8] ! tab[7]=a1^a2^a4
+ xor $a48,$a2,$a2
+
+ stx $a8,[$tab+8*8] ! tab[8]=a8
+ xor $a48,$a12,$a12
+ stx $a1,[$tab+9*8] ! tab[9]=a1^a8
+ xor $a4,$a1,$a1
+ stx $a2,[$tab+10*8] ! tab[10]=a2^a8
+ xor $a4,$a2,$a2
+ stx $a12,[$tab+11*8] ! tab[11]=a1^a2^a8
+
+ xor $a4,$a12,$a12
+ stx $a48,[$tab+12*8] ! tab[12]=a4^a8
+ srlx $lo,1,$hi
+ stx $a1,[$tab+13*8] ! tab[13]=a1^a4^a8
+ sllx $lo,63,$lo
+ stx $a2,[$tab+14*8] ! tab[14]=a2^a4^a8
+ srlx @i[0],2,@T[0]
+ stx $a12,[$tab+15*8] ! tab[15]=a1^a2^a4^a8
+
+ sllx @i[0],62,$a1
+ sllx $b,3,@i[0]
+ srlx @i[1],3,@T[1]
+ and @i[0],`0xf<<3`,@i[0]
+ sllx @i[1],61,$a2
+ ldx [$tab+@i[0]],@i[0]
+ srlx $b,4-3,@i[1]
+ xor @T[0],$hi,$hi
+ and @i[1],`0xf<<3`,@i[1]
+ xor $a1,$lo,$lo
+ ldx [$tab+@i[1]],@i[1]
+ xor @T[1],$hi,$hi
+
+ xor @i[0],$lo,$lo
+ srlx $b,8-3,@i[0]
+ xor $a2,$lo,$lo
+ and @i[0],`0xf<<3`,@i[0]
+___
+for($n=1;$n<14;$n++) { # unrolled 13 times; each pass indexes the table with 4 more bits of b
+$code.=<<___;
+ sllx @i[1],`$n*4`,@T[0]
+ ldx [$tab+@i[0]],@i[0]
+ srlx @i[1],`64-$n*4`,@T[1]
+ xor @T[0],$lo,$lo
+ srlx $b,`($n+2)*4`-3,@i[1]
+ xor @T[1],$hi,$hi
+ and @i[1],`0xf<<3`,@i[1]
+___
+ push(@i,shift(@i)); push(@T,shift(@T)); # swap the two registers' roles for the next pass
+}
+$code.=<<___;
+ sllx @i[1],`$n*4`,@T[0]
+ ldx [$tab+@i[0]],@i[0]
+ srlx @i[1],`64-$n*4`,@T[1]
+ xor @T[0],$lo,$lo
+
+ sllx @i[0],`($n+1)*4`,@T[0]
+ xor @T[1],$hi,$hi
+ srlx @i[0],`64-($n+1)*4`,@T[1]
+ xor @T[0],$lo,$lo
+ xor @T[1],$hi,$hi
+
+ srlx $lo,32,%i1
+ st $lo,[%i0+0]
+ st %i1,[%i0+4]
+ srlx $hi,32,%i2
+ st $hi,[%i0+8]
+ st %i2,[%i0+12]
+
+ ret
+ restore
+.type bn_GF2m_mul_2x2,#function
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.asciz "GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace each `expr` with its evaluated value
+print $code; # the generated assembly goes to STDOUT
+close STDOUT;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/sparcv9_modes.pl Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,1691 @@
+#!/usr/bin/env perl
+
+# Specific modes implementations for SPARC Architecture 2011. There
+# is T4 dependency though, an ASI value that is not specified in the
+# Architecture Manual. But as SPARC universe is rather monocultural,
+# we imply that processor capable of executing crypto instructions
+# can handle the ASI in question as well. This means that we ought to
+# keep eyes open when new processors emerge...
+#
+# As for above mentioned ASI. It's so called "block initializing
+# store" which cancels "read" in "read-update-write" on cache lines.
+# This is "cooperative" optimization, as it reduces overall pressure
+# on memory interface. Benefits can't be observed/quantified with
+# usual benchmarks, on the contrary you can notice that single-thread
+# performance for parallelizable modes is ~1.5% worse for largest
+# block sizes [though few percent better for not so long ones]. All
+# this based on suggestions from David Miller.
+
+sub asm_init { # expects the command-line flags, e.g. asm_init(@ARGV)
+ $::abibits=64 if (grep { /\-m64/ || /\-xarch\=v9/ } @_);
+ if ($::abibits!=64) { $::bias=0; $::frame=112; $::size_t_cc="%icc"; }
+ else { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
+}
+
+# unified interface: arguments are named %i0..%i4 (the surplus %i5 is dropped)
+my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
+# local variables: %l0..%l5 (the surplus %l6,%l7 are dropped by the assignment)
+my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
+
+sub alg_cbc_encrypt_implement { # appends ${alg}${bits}_t4_cbc_encrypt to $::code
+my ($alg,$bits) = @_; # both are spliced into the emitted symbol and label names
+# $::evp picks word-sized ld/st of the IV; otherwise IV access goes through alignaddr/faligndata
+$::code.=<<___;
+.globl ${alg}${bits}_t4_cbc_encrypt
+.align 32
+${alg}${bits}_t4_cbc_encrypt:
+ save %sp, -$::frame, %sp
+ cmp $len, 0
+ be,pn $::size_t_cc, .L${bits}_cbc_enc_abort
+ srln $len, 0, $len ! needed on v8+, "nop" on v9
+ sub $inp, $out, $blk_init ! $inp!=$out
+___
+$::code.=<<___ if (!$::evp);
+ andcc $ivec, 7, $ivoff
+ alignaddr $ivec, %g0, $ivec
+
+ ldd [$ivec + 0], %f0 ! load ivec
+ bz,pt %icc, 1f
+ ldd [$ivec + 8], %f2
+ ldd [$ivec + 16], %f4
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+1:
+___
+$::code.=<<___ if ($::evp);
+ ld [$ivec + 0], %f0
+ ld [$ivec + 4], %f1
+ ld [$ivec + 8], %f2
+ ld [$ivec + 12], %f3
+___
+$::code.=<<___;
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ call _${alg}${bits}_load_enckey
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 64, $iright
+ mov 0xff, $omask
+ sub $iright, $ileft, $iright
+ and $out, 7, $ooff
+ cmp $len, 127
+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
+ movleu $::size_t_cc, 0, $blk_init ! $len<128 ||
+ brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out)
+ srl $omask, $ooff, $omask
+
+ alignaddrl $out, %g0, $out
+ srlx $len, 4, $len
+ prefetch [$out], 22
+
+.L${bits}_cbc_enc_loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 4f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+4:
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ movxtod %o0, %f12
+ movxtod %o1, %f14
+
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ prefetch [$out + 63], 22
+ prefetch [$inp + 16+63], 20
+ call _${alg}${bits}_encrypt_1x
+ add $inp, 16, $inp
+
+ brnz,pn $ooff, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ brnz,pt $len, .L${bits}_cbc_enc_loop
+ add $out, 16, $out
+___
+$::code.=<<___ if ($::evp);
+ st %f0, [$ivec + 0]
+ st %f1, [$ivec + 4]
+ st %f2, [$ivec + 8]
+ st %f3, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, 3f
+ nop
+
+ std %f0, [$ivec + 0] ! write out ivec
+ std %f2, [$ivec + 8]
+___
+$::code.=<<___;
+.L${bits}_cbc_enc_abort:
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f4 ! handle unaligned output
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $omask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $omask, $omask
+ stda %f8, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_cbc_enc_loop+4
+ orn %g0, $omask, $omask
+___
+$::code.=<<___ if ($::evp);
+ st %f0, [$ivec + 0]
+ st %f1, [$ivec + 4]
+ st %f2, [$ivec + 8]
+ st %f3, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, 3f
+ nop
+
+ std %f0, [$ivec + 0] ! write out ivec
+ std %f2, [$ivec + 8]
+ ret
+ restore
+
+.align 16
+3: alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
+ mov 0xff, $omask
+ srl $omask, $ivoff, $omask
+ faligndata %f0, %f0, %f4
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+ stda %f4, [$ivec + $omask]0xc0
+ std %f6, [$ivec + 8]
+ add $ivec, 16, $ivec
+ orn %g0, $omask, $omask
+ stda %f8, [$ivec + $omask]0xc0
+___
+$::code.=<<___;
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}cbc_enc_blk:
+ add $out, $len, $blk_init
+ and $blk_init, 63, $blk_init ! tail
+ sub $len, $blk_init, $len
+ add $blk_init, 15, $blk_init ! round up to 16n
+ srlx $len, 4, $len
+ srl $blk_init, 4, $blk_init
+
+.L${bits}_cbc_enc_blk_loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 5f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+5:
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ movxtod %o0, %f12
+ movxtod %o1, %f14
+
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ prefetch [$inp + 16+63], 20
+ call _${alg}${bits}_encrypt_1x
+ add $inp, 16, $inp
+ sub $len, 1, $len
+
+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ brnz,pt $len, .L${bits}_cbc_enc_blk_loop
+ add $out, 8, $out
+
+ membar #StoreLoad|#StoreStore
+ brnz,pt $blk_init, .L${bits}_cbc_enc_loop
+ mov $blk_init, $len
+___
+$::code.=<<___ if ($::evp);
+ st %f0, [$ivec + 0]
+ st %f1, [$ivec + 4]
+ st %f2, [$ivec + 8]
+ st %f3, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, 3b
+ nop
+
+ std %f0, [$ivec + 0] ! write out ivec
+ std %f2, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+.type ${alg}${bits}_t4_cbc_encrypt,#function
+.size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
+___
+}
+
+sub alg_cbc_decrypt_implement { # appends ${alg}${bits}_t4_cbc_decrypt to $::code
+my ($alg,$bits) = @_; # both are spliced into the emitted symbol and label names
+# $::evp picks word-sized ld/st of the IV; otherwise every exit handles an unaligned $ivec via .L${bits}_cbc_dec_unaligned_ivec
+$::code.=<<___;
+.globl ${alg}${bits}_t4_cbc_decrypt
+.align 32
+${alg}${bits}_t4_cbc_decrypt:
+ save %sp, -$::frame, %sp
+ cmp $len, 0
+ be,pn $::size_t_cc, .L${bits}_cbc_dec_abort
+ srln $len, 0, $len ! needed on v8+, "nop" on v9
+ sub $inp, $out, $blk_init ! $inp!=$out
+___
+$::code.=<<___ if (!$::evp);
+ andcc $ivec, 7, $ivoff
+ alignaddr $ivec, %g0, $ivec
+
+ ldd [$ivec + 0], %f12 ! load ivec
+ bz,pt %icc, 1f
+ ldd [$ivec + 8], %f14
+ ldd [$ivec + 16], %f0
+ faligndata %f12, %f14, %f12
+ faligndata %f14, %f0, %f14
+1:
+___
+$::code.=<<___ if ($::evp);
+ ld [$ivec + 0], %f12 ! load ivec
+ ld [$ivec + 4], %f13
+ ld [$ivec + 8], %f14
+ ld [$ivec + 12], %f15
+___
+$::code.=<<___;
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ call _${alg}${bits}_load_deckey
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 64, $iright
+ mov 0xff, $omask
+ sub $iright, $ileft, $iright
+ and $out, 7, $ooff
+ cmp $len, 255
+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
+ brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out)
+ srl $omask, $ooff, $omask
+
+ andcc $len, 16, %g0 ! is number of blocks even?
+ srlx $len, 4, $len
+ alignaddrl $out, %g0, $out
+ bz %icc, .L${bits}_cbc_dec_loop2x
+ prefetch [$out], 22
+.L${bits}_cbc_dec_loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 4f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+4:
+ xor %g4, %o0, %o2 ! ^= rk[0]
+ xor %g5, %o1, %o3
+ movxtod %o2, %f0
+ movxtod %o3, %f2
+
+ prefetch [$out + 63], 22
+ prefetch [$inp + 16+63], 20
+ call _${alg}${bits}_decrypt_1x
+ add $inp, 16, $inp
+
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ movxtod %o0, %f12
+ movxtod %o1, %f14
+
+ brnz,pn $ooff, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ brnz,pt $len, .L${bits}_cbc_dec_loop2x
+ add $out, 16, $out
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0]
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+___
+$::code.=<<___;
+.L${bits}_cbc_dec_abort:
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f4 ! handle unaligned output
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $omask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $omask, $omask
+ stda %f8, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
+ orn %g0, $omask, $omask
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0]
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_cbc_dec_loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 4f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+4:
+ xor %g4, %o0, %o4 ! ^= rk[0]
+ xor %g5, %o1, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ xor %g4, %o2, %o4
+ xor %g5, %o3, %o5
+ movxtod %o4, %f4
+ movxtod %o5, %f6
+
+ prefetch [$out + 63], 22
+ prefetch [$inp + 32+63], 20
+ call _${alg}${bits}_decrypt_2x
+ add $inp, 32, $inp
+
+ movxtod %o0, %f8
+ movxtod %o1, %f10
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ movxtod %o2, %f12
+ movxtod %o3, %f14
+ fxor %f8, %f4, %f4
+ fxor %f10, %f6, %f6
+
+ brnz,pn $ooff, 2f
+ sub $len, 2, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ std %f4, [$out + 16]
+ std %f6, [$out + 24]
+ brnz,pt $len, .L${bits}_cbc_dec_loop2x
+ add $out, 32, $out
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0]
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f8 ! handle unaligned output
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+ faligndata %f6, %f6, %f6
+ stda %f8, [$out + $omask]0xc0 ! partial store
+ std %f0, [$out + 8]
+ std %f2, [$out + 16]
+ std %f4, [$out + 24]
+ add $out, 32, $out
+ orn %g0, $omask, $omask
+ stda %f6, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
+ orn %g0, $omask, $omask
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0]
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+ ret
+ restore
+
+.align 16
+.L${bits}_cbc_dec_unaligned_ivec:
+ alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
+ mov 0xff, $omask
+ srl $omask, $ivoff, $omask
+ faligndata %f12, %f12, %f0
+ faligndata %f12, %f14, %f2
+ faligndata %f14, %f14, %f4
+ stda %f0, [$ivec + $omask]0xc0
+ std %f2, [$ivec + 8]
+ add $ivec, 16, $ivec
+ orn %g0, $omask, $omask
+ stda %f4, [$ivec + $omask]0xc0
+___
+$::code.=<<___;
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}cbc_dec_blk:
+ add $out, $len, $blk_init
+ and $blk_init, 63, $blk_init ! tail
+ sub $len, $blk_init, $len
+ add $blk_init, 15, $blk_init ! round up to 16n
+ srlx $len, 4, $len
+ srl $blk_init, 4, $blk_init
+ sub $len, 1, $len
+ add $blk_init, 1, $blk_init
+
+.L${bits}_cbc_dec_blk_loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 5f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+5:
+ xor %g4, %o0, %o4 ! ^= rk[0]
+ xor %g5, %o1, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ xor %g4, %o2, %o4
+ xor %g5, %o3, %o5
+ movxtod %o4, %f4
+ movxtod %o5, %f6
+
+ prefetch [$inp + 32+63], 20
+ call _${alg}${bits}_decrypt_2x
+ add $inp, 32, $inp
+ subcc $len, 2, $len
+
+ movxtod %o0, %f8
+ movxtod %o1, %f10
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ movxtod %o2, %f12
+ movxtod %o3, %f14
+ fxor %f8, %f4, %f4
+ fxor %f10, %f6, %f6
+
+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
+ add $out, 8, $out
+
+ add $blk_init, $len, $len
+ andcc $len, 1, %g0 ! is number of blocks even?
+ membar #StoreLoad|#StoreStore
+ bnz,pt %icc, .L${bits}_cbc_dec_loop
+ srl $len, 0, $len
+ brnz,pn $len, .L${bits}_cbc_dec_loop2x
+ nop
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0] ! write out ivec
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+.type ${alg}${bits}_t4_cbc_decrypt,#function
+.size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
+___
+}
+
+sub alg_ctr32_implement { # appends ${alg}${bits}_t4_ctr32_encrypt to $::code
+my ($alg,$bits) = @_; # $alg is compared against "aes" and "cmll" below; both go into emitted names
+# $len is scaled by 16 on entry (sllx 4); only the last counter word (%l7) is stepped and then truncated to 32 bits
+$::code.=<<___;
+.globl ${alg}${bits}_t4_ctr32_encrypt
+.align 32
+${alg}${bits}_t4_ctr32_encrypt:
+ save %sp, -$::frame, %sp
+ srln $len, 0, $len ! needed on v8+, "nop" on v9
+
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ call _${alg}${bits}_load_enckey
+ sllx $len, 4, $len
+
+ ld [$ivec + 0], %l4 ! counter
+ ld [$ivec + 4], %l5
+ ld [$ivec + 8], %l6
+ ld [$ivec + 12], %l7
+
+ sllx %l4, 32, %o5
+ or %l5, %o5, %o5
+ sllx %l6, 32, %g1
+ xor %o5, %g4, %g4 ! ^= rk[0]
+ xor %g1, %g5, %g5
+ movxtod %g4, %f14 ! most significant 64 bits
+
+ sub $inp, $out, $blk_init ! $inp!=$out
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 64, $iright
+ mov 0xff, $omask
+ sub $iright, $ileft, $iright
+ and $out, 7, $ooff
+ cmp $len, 255
+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
+ brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out)
+ srl $omask, $ooff, $omask
+
+ andcc $len, 16, %g0 ! is number of blocks even?
+ alignaddrl $out, %g0, $out
+ bz %icc, .L${bits}_ctr32_loop2x
+ srlx $len, 4, $len
+.L${bits}_ctr32_loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 4f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+4:
+ xor %g5, %l7, %g1 ! ^= rk[0]
+ add %l7, 1, %l7
+ movxtod %g1, %f2
+ srl %l7, 0, %l7 ! clruw
+ prefetch [$out + 63], 22
+ prefetch [$inp + 16+63], 20
+___
+$::code.=<<___ if ($alg eq "aes");
+ aes_eround01 %f16, %f14, %f2, %f4
+ aes_eround23 %f18, %f14, %f2, %f2
+___
+$::code.=<<___ if ($alg eq "cmll");
+ camellia_f %f16, %f2, %f14, %f2
+ camellia_f %f18, %f14, %f2, %f0
+___
+$::code.=<<___;
+ call _${alg}${bits}_encrypt_1x+8
+ add $inp, 16, $inp
+
+ movxtod %o0, %f10
+ movxtod %o1, %f12
+ fxor %f10, %f0, %f0 ! ^= inp
+ fxor %f12, %f2, %f2
+
+ brnz,pn $ooff, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ brnz,pt $len, .L${bits}_ctr32_loop2x
+ add $out, 16, $out
+
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f4 ! handle unaligned output
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+ stda %f4, [$out + $omask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $omask, $omask
+ stda %f8, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_ctr32_loop2x+4
+ orn %g0, $omask, $omask
+
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_ctr32_loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 4f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+4:
+ xor %g5, %l7, %g1 ! ^= rk[0]
+ add %l7, 1, %l7
+ movxtod %g1, %f2
+ srl %l7, 0, %l7 ! clruw
+ xor %g5, %l7, %g1
+ add %l7, 1, %l7
+ movxtod %g1, %f6
+ srl %l7, 0, %l7 ! clruw
+ prefetch [$out + 63], 22
+ prefetch [$inp + 32+63], 20
+___
+$::code.=<<___ if ($alg eq "aes");
+ aes_eround01 %f16, %f14, %f2, %f8
+ aes_eround23 %f18, %f14, %f2, %f2
+ aes_eround01 %f16, %f14, %f6, %f10
+ aes_eround23 %f18, %f14, %f6, %f6
+___
+$::code.=<<___ if ($alg eq "cmll");
+ camellia_f %f16, %f2, %f14, %f2
+ camellia_f %f16, %f6, %f14, %f6
+ camellia_f %f18, %f14, %f2, %f0
+ camellia_f %f18, %f14, %f6, %f4
+___
+$::code.=<<___;
+ call _${alg}${bits}_encrypt_2x+16
+ add $inp, 32, $inp
+
+ movxtod %o0, %f8
+ movxtod %o1, %f10
+ movxtod %o2, %f12
+ fxor %f8, %f0, %f0 ! ^= inp
+ movxtod %o3, %f8
+ fxor %f10, %f2, %f2
+ fxor %f12, %f4, %f4
+ fxor %f8, %f6, %f6
+
+ brnz,pn $ooff, 2f
+ sub $len, 2, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ std %f4, [$out + 16]
+ std %f6, [$out + 24]
+ brnz,pt $len, .L${bits}_ctr32_loop2x
+ add $out, 32, $out
+
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f8 ! handle unaligned output
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+ faligndata %f6, %f6, %f6
+
+ stda %f8, [$out + $omask]0xc0 ! partial store
+ std %f0, [$out + 8]
+ std %f2, [$out + 16]
+ std %f4, [$out + 24]
+ add $out, 32, $out
+ orn %g0, $omask, $omask
+ stda %f6, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_ctr32_loop2x+4
+ orn %g0, $omask, $omask
+
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_ctr32_blk:
+ add $out, $len, $blk_init
+ and $blk_init, 63, $blk_init ! tail
+ sub $len, $blk_init, $len
+ add $blk_init, 15, $blk_init ! round up to 16n
+ srlx $len, 4, $len
+ srl $blk_init, 4, $blk_init
+ sub $len, 1, $len
+ add $blk_init, 1, $blk_init
+
+.L${bits}_ctr32_blk_loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 5f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+5:
+ xor %g5, %l7, %g1 ! ^= rk[0]
+ add %l7, 1, %l7
+ movxtod %g1, %f2
+ srl %l7, 0, %l7 ! clruw
+ xor %g5, %l7, %g1
+ add %l7, 1, %l7
+ movxtod %g1, %f6
+ srl %l7, 0, %l7 ! clruw
+ prefetch [$inp + 32+63], 20
+___
+$::code.=<<___ if ($alg eq "aes");
+ aes_eround01 %f16, %f14, %f2, %f8
+ aes_eround23 %f18, %f14, %f2, %f2
+ aes_eround01 %f16, %f14, %f6, %f10
+ aes_eround23 %f18, %f14, %f6, %f6
+___
+$::code.=<<___ if ($alg eq "cmll");
+ camellia_f %f16, %f2, %f14, %f2
+ camellia_f %f16, %f6, %f14, %f6
+ camellia_f %f18, %f14, %f2, %f0
+ camellia_f %f18, %f14, %f6, %f4
+___
+$::code.=<<___;
+ call _${alg}${bits}_encrypt_2x+16
+ add $inp, 32, $inp
+ subcc $len, 2, $len
+
+ movxtod %o0, %f8
+ movxtod %o1, %f10
+ movxtod %o2, %f12
+ fxor %f8, %f0, %f0 ! ^= inp
+ movxtod %o3, %f8
+ fxor %f10, %f2, %f2
+ fxor %f12, %f4, %f4
+ fxor %f8, %f6, %f6
+
+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x
+ add $out, 8, $out
+
+ add $blk_init, $len, $len
+ andcc $len, 1, %g0 ! is number of blocks even?
+ membar #StoreLoad|#StoreStore
+ bnz,pt %icc, .L${bits}_ctr32_loop
+ srl $len, 0, $len
+ brnz,pn $len, .L${bits}_ctr32_loop2x
+ nop
+
+ ret
+ restore
+.type ${alg}${bits}_t4_ctr32_encrypt,#function
+.size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
+___
+}
+
+sub alg_xts_implement { # appends ${alg}${bits}_t4_xts_${dir}crypt to the generated code
+my ($alg,$bits,$dir) = @_; # $dir is tested against "en" and "de" below
+my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5)); # shadows the file-level names
+my $rem=$ivec; # %i5 is reused: first the $ivec argument, later $len&15
+# NOTE(review): this sub appends through both $::code and bare $code -- confirm they name the same variable
+$::code.=<<___;
+.globl ${alg}${bits}_t4_xts_${dir}crypt
+.align 32
+${alg}${bits}_t4_xts_${dir}crypt:
+ save %sp, -$::frame-16, %sp
+ srln $len, 0, $len ! needed on v8+, "nop" on v9
+
+ mov $ivec, %o0
+ add %fp, $::bias-16, %o1
+ call ${alg}_t4_encrypt
+ mov $key2, %o2
+
+ add %fp, $::bias-16, %l7
+ ldxa [%l7]0x88, %g2
+ add %fp, $::bias-8, %l7
+ ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
+
+ sethi %hi(0x76543210), %l7
+ or %l7, %lo(0x76543210), %l7
+ bmask %l7, %g0, %g0 ! byte swap mask
+
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ call _${alg}${bits}_load_${dir}ckey
+ and $len, 15, $rem
+ and $len, -16, $len
+___
+$code.=<<___ if ($dir eq "de");
+ mov 0, %l7
+ movrnz $rem, 16, %l7
+ sub $len, %l7, $len
+___
+$code.=<<___;
+
+ sub $inp, $out, $blk_init ! $inp!=$out
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 64, $iright
+ mov 0xff, $omask
+ sub $iright, $ileft, $iright
+ and $out, 7, $ooff
+ cmp $len, 255
+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
+ brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out)
+ srl $omask, $ooff, $omask
+
+ andcc $len, 16, %g0 ! is number of blocks even?
+___
+$code.=<<___ if ($dir eq "de");
+ brz,pn $len, .L${bits}_xts_${dir}steal
+___
+$code.=<<___;
+ alignaddrl $out, %g0, $out
+ bz %icc, .L${bits}_xts_${dir}loop2x
+ srlx $len, 4, $len
+.L${bits}_xts_${dir}loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 4f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+4:
+ movxtod %g2, %f12
+ movxtod %g3, %f14
+ bshuffle %f12, %f12, %f12
+ bshuffle %f14, %f14, %f14
+
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ movxtod %o0, %f0
+ movxtod %o1, %f2
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+
+ prefetch [$out + 63], 22
+ prefetch [$inp + 16+63], 20
+ call _${alg}${bits}_${dir}crypt_1x
+ add $inp, 16, $inp
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ brnz,pn $ooff, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ brnz,pt $len, .L${bits}_xts_${dir}loop2x
+ add $out, 16, $out
+
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f4 ! handle unaligned output
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+ stda %f4, [$out + $omask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $omask, $omask
+ stda %f8, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
+ orn %g0, $omask, $omask
+
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_xts_${dir}loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 4f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+4:
+ movxtod %g2, %f12
+ movxtod %g3, %f14
+ bshuffle %f12, %f12, %f12
+ bshuffle %f14, %f14, %f14
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ movxtod %g2, %f8
+ movxtod %g3, %f10
+ bshuffle %f8, %f8, %f8
+ bshuffle %f10, %f10, %f10
+
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ xor %g4, %o2, %o2 ! ^= rk[0]
+ xor %g5, %o3, %o3
+ movxtod %o0, %f0
+ movxtod %o1, %f2
+ movxtod %o2, %f4
+ movxtod %o3, %f6
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+ fxor %f8, %f4, %f4 ! ^= tweak[0]
+ fxor %f10, %f6, %f6
+
+ prefetch [$out + 63], 22
+ prefetch [$inp + 32+63], 20
+ call _${alg}${bits}_${dir}crypt_2x
+ add $inp, 32, $inp
+
+ movxtod %g2, %f8
+ movxtod %g3, %f10
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ bshuffle %f8, %f8, %f8
+ bshuffle %f10, %f10, %f10
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+ fxor %f8, %f4, %f4
+ fxor %f10, %f6, %f6
+
+ brnz,pn $ooff, 2f
+ sub $len, 2, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ std %f4, [$out + 16]
+ std %f6, [$out + 24]
+ brnz,pt $len, .L${bits}_xts_${dir}loop2x
+ add $out, 32, $out
+
+ fsrc2 %f4, %f0
+ fsrc2 %f6, %f2
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f8 ! handle unaligned output
+ faligndata %f0, %f2, %f10
+ faligndata %f2, %f4, %f12
+ faligndata %f4, %f6, %f14
+ faligndata %f6, %f6, %f0
+
+ stda %f8, [$out + $omask]0xc0 ! partial store
+ std %f10, [$out + 8]
+ std %f12, [$out + 16]
+ std %f14, [$out + 24]
+ add $out, 32, $out
+ orn %g0, $omask, $omask
+ stda %f0, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
+ orn %g0, $omask, $omask
+
+ fsrc2 %f4, %f0
+ fsrc2 %f6, %f2
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_xts_${dir}blk:
+ add $out, $len, $blk_init
+ and $blk_init, 63, $blk_init ! tail
+ sub $len, $blk_init, $len
+ add $blk_init, 15, $blk_init ! round up to 16n
+ srlx $len, 4, $len
+ srl $blk_init, 4, $blk_init
+ sub $len, 1, $len
+ add $blk_init, 1, $blk_init
+
+.L${bits}_xts_${dir}blk2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 5f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+5:
+ movxtod %g2, %f12
+ movxtod %g3, %f14
+ bshuffle %f12, %f12, %f12
+ bshuffle %f14, %f14, %f14
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ movxtod %g2, %f8
+ movxtod %g3, %f10
+ bshuffle %f8, %f8, %f8
+ bshuffle %f10, %f10, %f10
+
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ xor %g4, %o2, %o2 ! ^= rk[0]
+ xor %g5, %o3, %o3
+ movxtod %o0, %f0
+ movxtod %o1, %f2
+ movxtod %o2, %f4
+ movxtod %o3, %f6
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+ fxor %f8, %f4, %f4 ! ^= tweak[0]
+ fxor %f10, %f6, %f6
+
+ prefetch [$inp + 32+63], 20
+ call _${alg}${bits}_${dir}crypt_2x
+ add $inp, 32, $inp
+
+ movxtod %g2, %f8
+ movxtod %g3, %f10
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ bshuffle %f8, %f8, %f8
+ bshuffle %f10, %f10, %f10
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+ fxor %f8, %f4, %f4
+ fxor %f10, %f6, %f6
+
+ subcc $len, 2, $len
+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x
+ add $out, 8, $out
+
+ add $blk_init, $len, $len
+ andcc $len, 1, %g0 ! is number of blocks even?
+ membar #StoreLoad|#StoreStore
+ bnz,pt %icc, .L${bits}_xts_${dir}loop
+ srl $len, 0, $len
+ brnz,pn $len, .L${bits}_xts_${dir}loop2x
+ nop
+
+ fsrc2 %f4, %f0
+ fsrc2 %f6, %f2
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+___
+$code.=<<___ if ($dir eq "en");
+.align 32
+.L${bits}_xts_${dir}steal:
+ std %f0, [%fp + $::bias-16] ! copy of output
+ std %f2, [%fp + $::bias-8]
+
+ srl $ileft, 3, $ileft
+ add %fp, $::bias-16, %l7
+ add $inp, $ileft, $inp ! original $inp+$len&-15
+ add $out, $ooff, $out ! original $out+$len&-15
+ mov 0, $ileft
+ nop ! align
+
+.L${bits}_xts_${dir}stealing:
+ ldub [$inp + $ileft], %o0
+ ldub [%l7 + $ileft], %o1
+ dec $rem
+ stb %o0, [%l7 + $ileft]
+ stb %o1, [$out + $ileft]
+ brnz $rem, .L${bits}_xts_${dir}stealing
+ inc $ileft
+
+ mov %l7, $inp
+ sub $out, 16, $out
+ mov 0, $ileft
+ sub $out, $ooff, $out
+ ba .L${bits}_xts_${dir}loop ! one more time
+ mov 1, $len ! $rem is 0
+___
+$code.=<<___ if ($dir eq "de");
+.align 32
+.L${bits}_xts_${dir}steal:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 8f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+8:
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %o2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %o3
+ xor %l7, %o2, %o2
+
+ movxtod %o2, %f12
+ movxtod %o3, %f14
+ bshuffle %f12, %f12, %f12
+ bshuffle %f14, %f14, %f14
+
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ movxtod %o0, %f0
+ movxtod %o1, %f2
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+
+ call _${alg}${bits}_${dir}crypt_1x
+ add $inp, 16, $inp
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+
+ std %f0, [%fp + $::bias-16]
+ std %f2, [%fp + $::bias-8]
+
+ srl $ileft, 3, $ileft
+ add %fp, $::bias-16, %l7
+ add $inp, $ileft, $inp ! original $inp+$len&-15
+ add $out, $ooff, $out ! original $out+$len&-15
+ mov 0, $ileft
+ add $out, 16, $out
+ nop ! align
+
+.L${bits}_xts_${dir}stealing:
+ ldub [$inp + $ileft], %o0
+ ldub [%l7 + $ileft], %o1
+ dec $rem
+ stb %o0, [%l7 + $ileft]
+ stb %o1, [$out + $ileft]
+ brnz $rem, .L${bits}_xts_${dir}stealing
+ inc $ileft
+
+ mov %l7, $inp
+ sub $out, 16, $out
+ mov 0, $ileft
+ sub $out, $ooff, $out
+ ba .L${bits}_xts_${dir}loop ! one more time
+ mov 1, $len ! $rem is 0
+___
+$code.=<<___;
+ ret
+ restore
+.type ${alg}${bits}_t4_xts_${dir}crypt,#function
+.size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
+___
+}
+
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# Idea is to reserve for option to produce "universal" binary and let
+# programmer detect if current CPU is VIS capable at run-time.
+sub unvis { # hand-encode a 3-operand VIS FP instruction as a .word directive
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %visopf = ( "faligndata" => 0x048, # mnemonic => opf field value
+ "bshuffle" => 0x04c,
+ "fnot2" => 0x066,
+ "fxor" => 0x06c,
+ "fsrc2" => 0x078 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd"; # original text, returned when not encodable
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) { # $_ aliases each operand, so assignments rewrite it
+ return $ref if (!/%f([0-9]{1,2})/); # not an %f register: leave as is
+ $_=$1; # keep only the register number
+ if ($1>=32) {
+ return $ref if ($1&1); # odd register above %f31: leave as is
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s", # original text is appended after "!"
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref; # mnemonic not in the table: pass through
+ }
+}
+
+sub unvis3 { # hand-encode a 3-operand VIS3 integer instruction as a .word directive
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); # bank letter => number of its register 0
+my ($ref,$opf);
+my %visopf = ( "addxc" => 0x011, # mnemonic => opf field value
+ "addxccc" => 0x013,
+ "umulxhi" => 0x016,
+ "alignaddr" => 0x018,
+ "bmask" => 0x019,
+ "alignaddrl" => 0x01a );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd"; # original text, returned when not encodable
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) { # $_ aliases each operand, so the assignment rewrites it
+ return $ref if (!/%([goli])([0-9])/); # not a %g/%o/%l/%i register: leave as is
+ $_=$bias{$1}+$2; # register number = bank base + index
+ }
+
+ return sprintf ".word\t0x%08x !%s", # original text is appended after "!"
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref; # mnemonic not in the table: pass through
+ }
+}
+
+sub unaes_round { # 4-argument instructions: hand-encode as a .word directive
+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
+my ($ref,$opf);
+my %aesopf = ( "aes_eround01" => 0, # mnemonic => opf field value
+ "aes_eround23" => 1,
+ "aes_dround01" => 2,
+ "aes_dround23" => 3,
+ "aes_eround01_l"=> 4,
+ "aes_eround23_l"=> 5,
+ "aes_dround01_l"=> 6,
+ "aes_dround23_l"=> 7,
+ "aes_kexpand1" => 8 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd"; # original text, returned when not encodable
+
+ if (defined($opf=$aesopf{$mnemonic})) { # defined() because an opf of 0 is valid
+ $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3; # even %f register => its encoded number, anything else is used as given
+ foreach ($rs1,$rs2,$rd) { # $_ aliases each operand, so assignments rewrite it
+ return $ref if (!/%f([0-9]{1,2})/); # not an %f register: leave as is
+ $_=$1; # keep only the register number
+ if ($1>=32) {
+ return $ref if ($1&1); # odd register above %f31: leave as is
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s", # original text is appended after "!"
+ 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref; # mnemonic not in the table: pass through
+ }
+}
+
+sub unaes_kexpand { # 3-argument instructions: hand-encode as a .word directive
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %aesopf = ( "aes_kexpand0" => 0x130, # mnemonic => opf field value
+ "aes_kexpand2" => 0x131 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd"; # original text, returned when not encodable
+
+ if (defined($opf=$aesopf{$mnemonic})) {
+ foreach ($rs1,$rs2,$rd) { # $_ aliases each operand, so assignments rewrite it
+ return $ref if (!/%f([0-9]{1,2})/); # not an %f register: leave as is
+ $_=$1; # keep only the register number
+ if ($1>=32) {
+ return $ref if ($1&1); # odd register above %f31: leave as is
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s", # original text is appended after "!"
+ 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref; # mnemonic not in the table: pass through
+ }
+}
+
+sub uncamellia_f { # 4-argument instructions: hand-encode as a .word directive
+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
+my ($ref,$opf); # NOTE(review): $opf is never assigned here; the opf value 0xc is hard-coded below
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd"; # original text, returned when not encodable
+
+ if (1) { # always taken; the else branch below is unreachable
+ $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3; # even %f register => its encoded number, anything else is used as given
+ foreach ($rs1,$rs2,$rd) { # $_ aliases each operand, so assignments rewrite it
+ return $ref if (!/%f([0-9]{1,2})/); # not an %f register: leave as is
+ $_=$1; # keep only the register number
+ if ($1>=32) {
+ return $ref if ($1&1); # odd register above %f31: leave as is
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s", # original text is appended after "!"
+ 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub uncamellia3 { # 3-argument instructions: hand-encode as a .word directive
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %cmllopf = ( "camellia_fl" => 0x13c, # mnemonic => opf field value
+ "camellia_fli" => 0x13d );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd"; # original text, returned when not encodable
+
+ if (defined($opf=$cmllopf{$mnemonic})) {
+ foreach ($rs1,$rs2,$rd) { # $_ aliases each operand, so assignments rewrite it
+ return $ref if (!/%f([0-9]{1,2})/); # not an %f register: leave as is
+ $_=$1; # keep only the register number
+ if ($1>=32) {
+ return $ref if ($1&1); # odd register above %f31: leave as is
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s", # original text is appended after "!"
+ 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref; # mnemonic not in the table: pass through
+ }
+}
+
+sub unmovxtox { # 2-argument instructions: hand-encode as a .word directive
+my ($mnemonic,$rs,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 ); # bank letter => number of its register 0
+my ($ref,$opf);
+my %movxopf = ( "movdtox" => 0x110, # mnemonic => opf field value
+ "movstouw" => 0x111,
+ "movstosw" => 0x113,
+ "movxtod" => 0x118,
+ "movwtos" => 0x119 );
+
+ $ref = "$mnemonic\t$rs,$rd"; # original text, returned when not encodable
+
+ if (defined($opf=$movxopf{$mnemonic})) {
+ foreach ($rs,$rd) { # $_ aliases each operand, so assignments rewrite it
+ return $ref if (!/%([fgoli])([0-9]{1,2})/); # not a recognised register: leave as is
+ $_=$bias{$1}+$2; # register number = bank base + index
+ if ($2>=32) {
+ return $ref if ($2&1); # odd register above 31: leave as is
+ # re-encode for upper double register addressing
+ $_=($2|$2>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s", # original text is appended after "!"
+ 2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
+ $ref;
+ } else {
+ return $ref; # mnemonic not in the table: pass through
+ }
+}
+
+sub undes { # hand-encode des_round/des_ip/des_iip/des_kexpand as a .word directive
+my ($mnemonic)=shift;
+my @args=@_; # working copy of the operands; rewritten in place to register numbers
+my ($ref,$opf);
+my %desopf = ( "des_round" => 0b1001, # mnemonic => opf field value
+ "des_ip" => 0b100110100,
+ "des_iip" => 0b100110101,
+ "des_kexpand" => 0b100110110 );
+
+ $ref = "$mnemonic\t".join(",",@_); # original text, returned when not encodable
+
+ if (defined($opf=$desopf{$mnemonic})) { # 4-arg
+ if ($mnemonic eq "des_round") {
+ foreach (@args[0..3]) { # $_ aliases each element of @args
+ return $ref if (!/%f([0-9]{1,2})/); # not an %f register: leave as is
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1); # odd register above %f31: leave as is
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
+ $ref;
+ } elsif ($mnemonic eq "des_kexpand") { # 3-arg
+ foreach (@args[0..2]) {
+ return $ref if (!/(%f)?([0-9]{1,2})/); # the %f prefix is optional here, so a bare number is accepted too
+ $_=$2;
+ if ($2>=32) {
+ return $ref if ($2&1);
+ # re-encode for upper double register addressing
+ $_=($2|$2>>5)&31;
+ }
+ }
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
+ $ref;
+ } else { # 2-arg
+ foreach (@args[0..1]) {
+ return $ref if (!/%f([0-9]{1,2})/); # single capture group: the register number is $1
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1); # odd register above %f31: leave as is
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
+ $ref;
+ }
+ } else {
+ return $ref; # mnemonic not in the table: pass through
+ }
+}
+
+sub emit_assembler { # post-process $::code line by line and print the result
+ foreach (split("\n",$::code)) {
+ s/\`([^\`]*)\`/eval $1/ge; # replace `...` with the result of evaluating it as Perl
+
+ s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go; # two-operand f...2 form: insert %f0 as first operand
+
+ s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
+ &unaes_round($1,$2,$3,$4,$5)
+ /geo or
+ s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &unaes_kexpand($1,$2,$3,$4)
+ /geo or
+ s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
+ &uncamellia_f($1,$2,$3,$4,$5)
+ /geo or
+ s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &uncamellia3($1,$2,$3,$4)
+ /geo or
+ s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
+ &undes($1,$2,$3,$4,$5)
+ /geo or
+ s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
+ &unmovxtox($1,$2,$3)
+ /geo or
+ s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
+ &unmovxtox($1,$2,$3)
+ /geo or
+ s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &unvis($1,$2,$3,$4)
+ /geo or
+ s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unvis3($1,$2,$3,$4)
+ /geo; # the "or" chain stops at the first substitution that matches
+
+ print $_,"\n"; # emit the (possibly rewritten) line
+ }
+}
+
+1;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/vis3-mont.pl Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,375 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2012.
+#
+# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
+# onward. There are three new instructions used here: umulxhi,
+# addxc[cc] and initializing store. On T3 RSA private key operations
+# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
+# lengths. This is without dedicated squaring procedure. On T4
+# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
+# for reference purposes, because T4 has dedicated Montgomery
+# multiplication and squaring *instructions* that deliver even more.
+
+$bits=32; # default to 32-bit unless a 64-bit flag is seen below
+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } # any argument containing -m64 or -xarch=v9 selects 64-bit
+if ($bits==64) { $bias=2047; $frame=192; } # $bias is added to %sp/%fp offsets, $frame is subtracted when sizing the new stack
+else { $bias=0; $frame=112; }
+
+$code.=<<___ if ($bits==64);
+.register %g2,#scratch
+.register %g3,#scratch
+___
+$code.=<<___;
+#include <openssl/fipssyms.h>
+
+.section ".text",#alloc,#execinstr
+___
+
+($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
+ (map("%g$_",(1..5)),map("%o$_",(0..5,7)));
+
+# int bn_mul_mont(
+$rp="%o0"; # BN_ULONG *rp,
+$ap="%o1"; # const BN_ULONG *ap,
+$bp="%o2"; # const BN_ULONG *bp,
+$np="%o3"; # const BN_ULONG *np,
+$n0p="%o4"; # const BN_ULONG *n0,
+$num="%o5"; # int num); # caller ensures that num is even
+ # and >=6
+$code.=<<___;
+.globl bn_mul_mont_vis3
+.align 32
+bn_mul_mont_vis3:
+ add %sp, $bias, %g4 ! real top of stack
+ sll $num, 2, $num ! size in bytes
+ add $num, 63, %g5
+ andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes
+ add %g5, %g5, %g1
+ add %g5, %g1, %g1 ! 3*buffer size
+ sub %g4, %g1, %g1
+ andn %g1, 63, %g1 ! align at 64 byte
+ sub %g1, $frame, %g1 ! new top of stack
+ sub %g1, %g4, %g1
+
+ save %sp, %g1, %sp
+___
+
+# +-------------------------------+<----- %sp
+# . .
+# +-------------------------------+<----- aligned at 64 bytes
+# | __int64 tmp[0] |
+# +-------------------------------+
+# . .
+# . .
+# +-------------------------------+<----- aligned at 64 bytes
+# | __int64 ap[1..0] | converted ap[]
+# +-------------------------------+
+# | __int64 np[1..0] | converted np[]
+# +-------------------------------+
+# | __int64 ap[3..2] |
+# . .
+# . .
+# +-------------------------------+
+($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
+($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
+($ovf,$i)=($t0,$t1);
+$code.=<<___;
+ ld [$n0p+0], $t0 ! pull n0[0..1] value
+ add %sp, $bias+$frame, $tp
+ ld [$n0p+4], $t1
+ add $tp, %g5, $anp
+ ld [$bp+0], $t2 ! m0=bp[0]
+ sllx $t1, 32, $n0
+ ld [$bp+4], $t3
+ or $t0, $n0, $n0
+ add $bp, 8, $bp
+
+ ld [$ap+0], $t0 ! ap[0]
+ sllx $t3, 32, $m0
+ ld [$ap+4], $t1
+ or $t2, $m0, $m0
+
+ ld [$ap+8], $t2 ! ap[1]
+ sllx $t1, 32, $aj
+ ld [$ap+12], $t3
+ or $t0, $aj, $aj
+ add $ap, 16, $ap
+ stx $aj, [$anp] ! converted ap[0]
+
+ mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
+ umulxhi $aj, $m0, $hi0
+
+ ld [$np+0], $t0 ! np[0]
+ sllx $t3, 32, $aj
+ ld [$np+4], $t1
+ or $t2, $aj, $aj
+
+ ld [$np+8], $t2 ! np[1]
+ sllx $t1, 32, $nj
+ ld [$np+12], $t3
+ or $t0, $nj, $nj
+ add $np, 16, $np
+ stx $nj, [$anp+8] ! converted np[0]
+
+ mulx $lo0, $n0, $m1 ! "tp[0]"*n0
+ stx $aj, [$anp+16] ! converted ap[1]
+
+ mulx $aj, $m0, $alo ! ap[1]*bp[0]
+ umulxhi $aj, $m0, $aj ! ahi=aj
+
+ mulx $nj, $m1, $lo1 ! np[0]*m1
+ umulxhi $nj, $m1, $hi1
+
+ sllx $t3, 32, $nj
+ or $t2, $nj, $nj
+ stx $nj, [$anp+24] ! converted np[1]
+ add $anp, 32, $anp
+
+ addcc $lo0, $lo1, $lo1
+ addxc %g0, $hi1, $hi1
+
+ mulx $nj, $m1, $nlo ! np[1]*m1
+ umulxhi $nj, $m1, $nj ! nhi=nj
+
+ ba .L1st
+ sub $num, 24, $cnt ! cnt=num-3
+
+.align 16
+.L1st:
+ ld [$ap+0], $t0 ! ap[j]
+ addcc $alo, $hi0, $lo0
+ ld [$ap+4], $t1
+ addxc $aj, %g0, $hi0
+
+ sllx $t1, 32, $aj
+ add $ap, 8, $ap
+ or $t0, $aj, $aj
+ stx $aj, [$anp] ! converted ap[j]
+
+ ld [$np+0], $t2 ! np[j]
+ addcc $nlo, $hi1, $lo1
+ ld [$np+4], $t3
+ addxc $nj, %g0, $hi1 ! nhi=nj
+
+ sllx $t3, 32, $nj
+ add $np, 8, $np
+ mulx $aj, $m0, $alo ! ap[j]*bp[0]
+ or $t2, $nj, $nj
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ stx $nj, [$anp+8] ! converted np[j]
+ add $anp, 16, $anp ! anp++
+
+ mulx $nj, $m1, $nlo ! np[j]*m1
+ addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
+ umulxhi $nj, $m1, $nj ! nhi=nj
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+ add $tp, 8, $tp ! tp++
+
+ brnz,pt $cnt, .L1st
+ sub $cnt, 8, $cnt ! j--
+!.L1st
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0 ! ahi=aj
+
+ addcc $nlo, $hi1, $lo1
+ addxc $nj, %g0, $hi1
+ addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+ add $tp, 8, $tp
+
+ addcc $hi0, $hi1, $hi1
+ addxc %g0, %g0, $ovf ! upmost overflow bit
+ stx $hi1, [$tp]
+ add $tp, 8, $tp
+
+ ba .Louter
+ sub $num, 16, $i ! i=num-2
+
+.align 16
+.Louter:
+ ld [$bp+0], $t2 ! m0=bp[i]
+ ld [$bp+4], $t3
+
+ sub $anp, $num, $anp ! rewind
+ sub $tp, $num, $tp
+ sub $anp, $num, $anp
+
+ add $bp, 8, $bp
+ sllx $t3, 32, $m0
+ ldx [$anp+0], $aj ! ap[0]
+ or $t2, $m0, $m0
+ ldx [$anp+8], $nj ! np[0]
+
+ mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
+ ldx [$tp], $tj ! tp[0]
+ umulxhi $aj, $m0, $hi0
+ ldx [$anp+16], $aj ! ap[1]
+ addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
+ mulx $aj, $m0, $alo ! ap[1]*bp[i]
+ addxc %g0, $hi0, $hi0
+ mulx $lo0, $n0, $m1 ! tp[0]*n0
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ mulx $nj, $m1, $lo1 ! np[0]*m1
+ umulxhi $nj, $m1, $hi1
+ ldx [$anp+24], $nj ! np[1]
+ add $anp, 32, $anp
+ addcc $lo1, $lo0, $lo1
+ mulx $nj, $m1, $nlo ! np[1]*m1
+ addxc %g0, $hi1, $hi1
+ umulxhi $nj, $m1, $nj ! nhi=nj
+
+ ba .Linner
+ sub $num, 24, $cnt ! cnt=num-3
+.align 16
+.Linner:
+ addcc $alo, $hi0, $lo0
+ ldx [$tp+8], $tj ! tp[j]
+ addxc $aj, %g0, $hi0 ! ahi=aj
+ ldx [$anp+0], $aj ! ap[j]
+ addcc $nlo, $hi1, $lo1
+ mulx $aj, $m0, $alo ! ap[j]*bp[i]
+ addxc $nj, %g0, $hi1 ! nhi=nj
+ ldx [$anp+8], $nj ! np[j]
+ add $anp, 16, $anp
+ umulxhi $aj, $m0, $aj ! ahi=aj
+ addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
+ mulx $nj, $m1, $nlo ! np[j]*m1
+ addxc %g0, $hi0, $hi0
+ umulxhi $nj, $m1, $nj ! nhi=nj
+ addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+ add $tp, 8, $tp
+ brnz,pt $cnt, .Linner
+ sub $cnt, 8, $cnt
+!.Linner
+ ldx [$tp+8], $tj ! tp[j]
+ addcc $alo, $hi0, $lo0
+ addxc $aj, %g0, $hi0 ! ahi=aj
+ addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi0, $hi0
+
+ addcc $nlo, $hi1, $lo1
+ addxc $nj, %g0, $hi1 ! nhi=nj
+ addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
+ addxc %g0, $hi1, $hi1
+ stx $lo1, [$tp] ! tp[j-1]
+
+ subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
+ addxccc $hi1, $hi0, $hi1
+ addxc %g0, %g0, $ovf
+ stx $hi1, [$tp+8]
+ add $tp, 16, $tp
+
+ brnz,pt $i, .Louter
+ sub $i, 8, $i
+
+ sub $anp, $num, $anp ! rewind
+ sub $tp, $num, $tp
+ sub $anp, $num, $anp
+ ba .Lsub
+ subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
+
+.align 16
+.Lsub:
+ ldx [$tp], $tj
+ add $tp, 8, $tp
+ ldx [$anp+8], $nj
+ add $anp, 16, $anp
+ subccc $tj, $nj, $t2 ! tp[j]-np[j]
+ srlx $tj, 32, $tj
+ srlx $nj, 32, $nj
+ subccc $tj, $nj, $t3
+ add $rp, 8, $rp
+ st $t2, [$rp-4] ! reverse order
+ st $t3, [$rp-8]
+ brnz,pt $cnt, .Lsub
+ sub $cnt, 8, $cnt
+
+ sub $anp, $num, $anp ! rewind
+ sub $tp, $num, $tp
+ sub $anp, $num, $anp
+ sub $rp, $num, $rp
+
+ subc $ovf, %g0, $ovf ! handle upmost overflow bit
+ and $tp, $ovf, $ap
+ andn $rp, $ovf, $np
+ or $np, $ap, $ap ! ap=borrow?tp:rp
+ ba .Lcopy
+ sub $num, 8, $cnt
+
+.align 16
+.Lcopy: ! copy or in-place refresh
+ ld [$ap+0], $t2
+ ld [$ap+4], $t3
+ add $ap, 8, $ap
+ stx %g0, [$tp] ! zap
+ add $tp, 8, $tp
+ stx %g0, [$anp] ! zap
+ stx %g0, [$anp+8]
+ add $anp, 16, $anp
+ st $t3, [$rp+0] ! flip order
+ st $t2, [$rp+4]
+ add $rp, 8, $rp
+ brnz $cnt, .Lcopy
+ sub $cnt, 8, $cnt
+
+ mov 1, %o0
+ ret
+ restore
+.type bn_mul_mont_vis3, #function
+.size bn_mul_mont_vis3, .-bn_mul_mont_vis3
+.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# Idea is to reserve for option to produce "universal" binary and let
+# programmer detect if current CPU is VIS capable at run-time.
+sub unvis3 { # hand-encode a 3-operand VIS3 integer instruction as a .word directive
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); # bank letter => number of its register 0
+my ($ref,$opf);
+my %visopf = ( "addxc" => 0x011, # mnemonic => opf field value
+ "addxccc" => 0x013,
+ "umulxhi" => 0x016 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd"; # original text, returned when not encodable
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) { # $_ aliases each operand, so the assignment rewrites it
+ return $ref if (!/%([goli])([0-9])/); # not a %g/%o/%l/%i register: leave as is
+ $_=$bias{$1}+$2; # register number = bank base + index
+ }
+
+ return sprintf ".word\t0x%08x !%s", # original text is appended after "!"
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref; # mnemonic not in the table: pass through
+ }
+}
+
+foreach (split("\n",$code)) { # post-process the generated assembly line by line
+ s/\`([^\`]*)\`/eval $1/ge; # replace `...` with the result of evaluating it as Perl
+
+ s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unvis3($1,$2,$3,$4)
+ /ge;
+
+ print $_,"\n"; # emit the (possibly rewritten) line
+}
+
+close STDOUT;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/patches/302-t4-inline.patch Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,710 @@
+#
+# This file adds inline T4 instruction support to OpenSSL upstream code.
+# The change was brought in from OpenSSL 1.0.2.
+#
+Index: Configure
+===================================================================
+diff -ru openssl-1.0.1e/Configure openssl-1.0.1e/Configure
+--- a/Configure 2011-05-24 17:02:24.000000000 -0700
++++ b/Configure 2011-07-27 10:48:17.817470000 -0700
+@@ -129,7 +129,7 @@
+
+ my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:e_padlock-x86_64.o";
+ my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
+-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
++my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
+ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
+ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
+ my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
+Index: crypto/sparccpuid.S
+===================================================================
+diff -ru openssl-1.0.1e/crypto/sparccpuid.S openssl-1.0.1e/crypto/sparccpuid.S
+--- a/crypto/sparccpuid.S 2011-05-24 17:02:24.000000000 -0700
++++ b/crypto/sparccpuid.S 2011-07-27 10:48:17.817470000 -0700
+@@ -255,7 +255,12 @@
+ ! UltraSPARC IIe 7
+ ! UltraSPARC III 7
+ ! UltraSPARC T1 24
++! SPARC T4 65(*)
+ !
++! (*) result has lesser to do with VIS instruction latencies, rdtick
++! appears that slow, but it does the trick in sense that FP and
++! VIS code paths are still slower than integer-only ones.
++!
+ ! Numbers for T2 and SPARC64 V-VII are more than welcomed.
+ !
+ ! It would be possible to detect specifically US-T1 by instrumenting
+@@ -264,6 +269,8 @@
+ .global _sparcv9_vis1_instrument
+ .align 8
+ _sparcv9_vis1_instrument:
++ .word 0x81b00d80 !fxor %f0,%f0,%f0
++ .word 0x85b08d82 !fxor %f2,%f2,%f2
+ .word 0x91410000 !rd %tick,%o0
+ .word 0x81b00d80 !fxor %f0,%f0,%f0
+ .word 0x85b08d82 !fxor %f2,%f2,%f2
+Index: crypto/sparcv9cap.c
+===================================================================
+--- openssl-fips-2.0.13/crypto/sparcv9cap.c.~1~ 2016-06-20 12:49:42.000000000 -0700
++++ openssl-fips-2.0.13/crypto/sparcv9cap.c 2016-09-08 14:37:20.252604855 -0700
+@@ -4,41 +4,81 @@
+ #include <setjmp.h>
+ #include <signal.h>
+ #include <sys/time.h>
++#include <unistd.h>
+ #include <openssl/bn.h>
++#include "sparc_arch.h"
+
+-#define SPARCV9_TICK_PRIVILEGED (1<<0)
+-#define SPARCV9_PREFER_FPU (1<<1)
+-#define SPARCV9_VIS1 (1<<2)
+-#define SPARCV9_VIS2 (1<<3) /* reserved */
+-#define SPARCV9_FMADD (1<<4) /* reserved for SPARC64 V */
+-#define SPARCV9_BLK (1<<5) /* VIS1 block copy */
++#if defined(__GNUC__) && defined(__linux)
++__attribute__((visibility("hidden")))
++#endif
+
+-static int OPENSSL_sparcv9cap_P=SPARCV9_TICK_PRIVILEGED;
++extern unsigned OPENSSL_sparcv9cap_P[2];
+
+ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
+- {
+- int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
+- int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
++{
++ int bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
++ int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
++ int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
+
+- if (num>=8 && !(num&1) &&
+- (OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
+- (SPARCV9_PREFER_FPU|SPARCV9_VIS1))
+- return bn_mul_mont_fpu(rp,ap,bp,np,n0,num);
+- else
+- return bn_mul_mont_int(rp,ap,bp,np,n0,num);
+- }
++ if (!(num & 1) && num >= 6) {
++ if ((num & 15) == 0 && num <= 64 &&
++ (OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) ==
++ (CFR_MONTMUL | CFR_MONTSQR)) {
++ typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap,
++ const BN_ULONG *bp,
++ const BN_ULONG *np,
++ const BN_ULONG *n0);
++ int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap,
++ const BN_ULONG *bp, const BN_ULONG *np,
++ const BN_ULONG *n0);
++ int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap,
++ const BN_ULONG *bp, const BN_ULONG *np,
++ const BN_ULONG *n0);
++ int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap,
++ const BN_ULONG *bp, const BN_ULONG *np,
++ const BN_ULONG *n0);
++ int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap,
++ const BN_ULONG *bp, const BN_ULONG *np,
++ const BN_ULONG *n0);
++ static const bn_mul_mont_f funcs[4] = {
++ bn_mul_mont_t4_8, bn_mul_mont_t4_16,
++ bn_mul_mont_t4_24, bn_mul_mont_t4_32
++ };
++ bn_mul_mont_f worker = funcs[num / 16 - 1];
+
++ if ((*worker) (rp, ap, bp, np, n0))
++ return 1;
++ /* retry once and fall back */
++ if ((*worker) (rp, ap, bp, np, n0))
++ return 1;
++ return bn_mul_mont_vis3(rp, ap, bp, np, n0, num);
++ }
++ if ((OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3))
++ return bn_mul_mont_vis3(rp, ap, bp, np, n0, num);
++ else if (num >= 8 &&
++ (OPENSSL_sparcv9cap_P[0] &
++ (SPARCV9_PREFER_FPU | SPARCV9_VIS1)) ==
++ (SPARCV9_PREFER_FPU | SPARCV9_VIS1))
++ return bn_mul_mont_fpu(rp, ap, bp, np, n0, num);
++ }
++ return bn_mul_mont_int(rp, ap, bp, np, n0, num);
++}
++
++
+ unsigned long _sparcv9_rdtick(void);
+ void _sparcv9_vis1_probe(void);
+ unsigned long _sparcv9_vis1_instrument(void);
+ void _sparcv9_vis2_probe(void);
+ void _sparcv9_fmadd_probe(void);
++unsigned long _sparcv9_rdcfr(void);
++void _sparcv9_vis3_probe(void);
++unsigned long _sparcv9_random(void);
+ size_t _sparcv9_vis1_instrument_bus(unsigned int *,size_t);
+ size_t _sparcv8_vis1_instrument_bus2(unsigned int *,size_t,size_t);
+
+ unsigned long OPENSSL_rdtsc(void)
+ {
+- if (OPENSSL_sparcv9cap_P&SPARCV9_TICK_PRIVILEGED)
++ if (OPENSSL_sparcv9cap_P[0]&SPARCV9_TICK_PRIVILEGED)
+ #if defined(__sun) && defined(__SVR4)
+ return gethrtime();
+ #else
+@@ -50,7 +90,7 @@
+
+ size_t OPENSSL_instrument_bus(unsigned int *out,size_t cnt)
+ {
+- if (OPENSSL_sparcv9cap_P&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK) ==
++ if ((OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK)) ==
+ SPARCV9_BLK)
+ return _sparcv9_vis1_instrument_bus(out,cnt);
+ else
+@@ -59,7 +99,7 @@
+
+ size_t OPENSSL_instrument_bus2(unsigned int *out,size_t cnt,size_t max)
+ {
+- if (OPENSSL_sparcv9cap_P&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK) ==
++ if ((OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK)) ==
+ SPARCV9_BLK)
+ return _sparcv9_vis1_instrument_bus2(out,cnt,max);
+ else
+@@ -120,7 +160,9 @@
+
+ if ((e=getenv("OPENSSL_sparcv9cap")))
+ {
+- OPENSSL_sparcv9cap_P=strtoul(e,NULL,0);
++ OPENSSL_sparcv9cap_P[0]=strtoul(e,NULL,0);
++ if ((e = strchr(e, ':')))
++ OPENSSL_sparcv9cap_P[1] = strtoul(e + 1, NULL, 0);
+ return;
+ }
+
+@@ -128,17 +170,17 @@
+ {
+ if (strcmp(si,"sun4v"))
+ /* FPU is preferred for all CPUs, but US-T1/2 */
+- OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU;
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_PREFER_FPU;
+ }
+
+ if (sysinfo(SI_ISALIST,si,sizeof(si))>0)
+ {
+ if (strstr(si,"+vis"))
+- OPENSSL_sparcv9cap_P |= SPARCV9_VIS1|SPARCV9_BLK;
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS1|SPARCV9_BLK;
+ if (strstr(si,"+vis2"))
+ {
+- OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
+- OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
++ OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
+ return;
+ }
+ }
+@@ -198,12 +240,14 @@
+
+ if ((e=getenv("OPENSSL_sparcv9cap")))
+ {
+- OPENSSL_sparcv9cap_P=strtoul(e,NULL,0);
++ OPENSSL_sparcv9cap_P[0]=strtoul(e,NULL,0);
++ if ((e = strchr(e, ':')))
++ OPENSSL_sparcv9cap_P[1] = strtoul(e + 1, NULL, 0);
+ return;
+ }
+
+ /* Initial value, fits UltraSPARC-I&II... */
+- OPENSSL_sparcv9cap_P = SPARCV9_PREFER_FPU|SPARCV9_TICK_PRIVILEGED;
++ OPENSSL_sparcv9cap_P[0] = SPARCV9_PREFER_FPU|SPARCV9_TICK_PRIVILEGED;
+
+ sigfillset(&all_masked);
+ sigdelset(&all_masked,SIGILL);
+@@ -226,20 +270,20 @@
+ if (sigsetjmp(common_jmp,1) == 0)
+ {
+ _sparcv9_rdtick();
+- OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
++ OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
+ }
+
+ if (sigsetjmp(common_jmp,1) == 0)
+ {
+ _sparcv9_vis1_probe();
+- OPENSSL_sparcv9cap_P |= SPARCV9_VIS1|SPARCV9_BLK;
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS1|SPARCV9_BLK;
+ /* detect UltraSPARC-Tx, see sparccpud.S for details... */
+ if (_sparcv9_vis1_instrument() >= 12)
+- OPENSSL_sparcv9cap_P &= ~(SPARCV9_VIS1|SPARCV9_PREFER_FPU);
++ OPENSSL_sparcv9cap_P[0] &= ~(SPARCV9_VIS1|SPARCV9_PREFER_FPU);
+ else
+ {
+ _sparcv9_vis2_probe();
+- OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
+ }
+ }
+
+@@ -246,13 +290,49 @@
+ if (sigsetjmp(common_jmp,1) == 0)
+ {
+ _sparcv9_fmadd_probe();
+- OPENSSL_sparcv9cap_P |= SPARCV9_FMADD;
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_FMADD;
+ }
+
++ /*
++ * VIS3 flag is tested independently from VIS1, unlike VIS2 that is,
++ * because VIS3 defines even integer instructions.
++ */
++ if (sigsetjmp(common_jmp,1) == 0) {
++ _sparcv9_vis3_probe();
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS3;
++ }
++
++ if (sigsetjmp(common_jmp,1) == 0) {
++ (void)_sparcv9_random();
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_RANDOM;
++ }
++
++ /*
++ * In wait for better solution _sparcv9_rdcfr is masked by
++ * VIS3 flag, because it goes to uninterruptable endless
++ * loop on UltraSPARC II running Solaris. Things might be
++ * different on Linux...
++ */
++ if ((OPENSSL_sparcv9cap_P[0]&SPARCV9_VIS3) &&
++ sigsetjmp(common_jmp, 1) == 0) {
++ OPENSSL_sparcv9cap_P[1] = (unsigned int)_sparcv9_rdcfr();
++ }
++
+ sigaction(SIGBUS,&bus_oact,NULL);
+ sigaction(SIGILL,&ill_oact,NULL);
+
+ sigprocmask(SIG_SETMASK,&oset,NULL);
++
++ if (sizeof(size_t) == 8)
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_64BIT_STACK;
++#ifdef __linux
++ else {
++ int ret = syscall(340);
++
++ if (ret >= 0 && ret & 1)
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_64BIT_STACK;
+ }
++#endif
++ }
+
+ #endif
+Index: crypto/sha/Makefile
+===================================================================
+diff -ru openssl-1.0.1e/crypto/sha/Makefile openssl-1.0.1e/crypto/sha/Makefile
+--- a/crypto/sha/Makefile 2011-05-24 17:02:24.000000000 -0700
++++ b/crypto/sha/Makefile 2011-07-27 10:48:17.817470000 -0700
+@@ -66,9 +66,9 @@
+ sha1-x86_64.s: asm/sha1-x86_64.pl; $(PERL) asm/sha1-x86_64.pl $(PERLASM_SCHEME) > $@
+ sha256-x86_64.s:asm/sha512-x86_64.pl; $(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@
+ sha512-x86_64.s:asm/sha512-x86_64.pl; $(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@
+-sha1-sparcv9.s: asm/sha1-sparcv9.pl; $(PERL) asm/sha1-sparcv9.pl $@ $(CFLAGS)
+-sha256-sparcv9.s:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
+-sha512-sparcv9.s:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
++sha1-sparcv9.S: asm/sha1-sparcv9.pl; $(PERL) asm/sha1-sparcv9.pl $@ $(CFLAGS)
++sha256-sparcv9.S:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
++sha512-sparcv9.S:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
+
+ sha1-ppc.s: asm/sha1-ppc.pl; $(PERL) asm/sha1-ppc.pl $(PERLASM_SCHEME) $@
+ sha256-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
+Index: crypto/des/Makefile
+===================================================================
+diff -ru openssl-1.0.1e/crypto/des/Makefile.orig openssl-1.0.1e/crypto/des/Makefile
+--- a/crypto/des/Makefile
++++ b/crypto/des/Makefile
+@@ -61,6 +61,10 @@ des: des.o cbc3_enc.o lib
+
+ des_enc-sparc.S: asm/des_enc.m4
+ m4 -B 8192 asm/des_enc.m4 > des_enc-sparc.S
++dest4-sparcv9.S: asm/dest4-sparcv9.pl
++ $(PERL) asm/dest4-sparcv9.pl $(CFLAGS) > $@
++aest4-sparcv9.o: aest4-sparcv9.S
++ $(AS) $(ASFLAGS) -Wa,-n -o $@ $^
+
+ des-586.s: asm/des-586.pl ../perlasm/x86asm.pl ../perlasm/cbc.pl
+ $(PERL) asm/des-586.pl $(PERLASM_SCHEME) $(CFLAGS) > $@
+Index: openssl/crypto/bn/Makefile
+===================================================================
+diff -ru openssl-1.0.1e/crypto/bn/Makefile openssl-1.0.1e/crypto/bn/Makefile.new
+--- openssl-1.0.1e/crypto/bn/Makefile 2011-05-24 17:02:24.000000000 -0700
++++ openssl-1.0.1e/crypto/bn/Makefile 2011-07-27 10:48:17.817470000 -0700
+@@ -77,6 +77,16 @@
+ $(PERL) asm/sparcv9a-mont.pl $(CFLAGS) > $@
+ sparcv9-mont.s: asm/sparcv9-mont.pl
+ $(PERL) asm/sparcv9-mont.pl $(CFLAGS) > $@
++vis3-mont.S: asm/vis3-mont.pl
++ $(PERL) asm/vis3-mont.pl $(CFLAGS) > $@
++vis3-mont.o: vis3-mont.S
++ $(AS) $(ASFLAGS) -Wa,-n -o $@ $^
++sparct4-mont.S: asm/sparct4-mont.pl
++ $(PERL) asm/sparct4-mont.pl $(CFLAGS) > $@
++sparct4-mont.o: sparct4-mont.S
++ $(CC) $(CFLAGS) -Wa,-n -c -o $@ $^
++sparcv9-gf2m.S: asm/sparcv9-gf2m.pl
++ $(PERL) asm/sparcv9-gf2m.pl $(CFLAGS) > $@
+
+ bn-mips3.o: asm/mips3.s
+ @if [ "$(CC)" = "gcc" ]; then \
+Index: openssl/crypto/aes/Makefile
+===================================================================
+--- Makefile Thu May 2 13:42:37 2013
++++ Makefile.orig Thu May 2 13:41:51 2013
+@@ -69,6 +69,11 @@
+ aes-sparcv9.s: asm/aes-sparcv9.pl
+ $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@
+
++aest4-sparcv9.S: asm/aest4-sparcv9.pl
++ $(PERL) asm/aest4-sparcv9.pl $(CFLAGS) > $@
++aest4-sparcv9.o: aest4-sparcv9.S
++ $(AS) $(ASFLAGS) -Wa,-n -o $@ $^
++
+ aes-ppc.s: asm/aes-ppc.pl
+ $(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@
+ aesp8-ppc.s: asm/aesp8-ppc.pl
+Index: openssl/crypto/evp/evp.h
+===================================================================
+--- evp.h Mon Feb 11 07:26:04 2013
++++ evp.h.new Thu May 2 14:31:55 2013
+@@ -1282,6 +1282,7 @@
+ #define EVP_F_AESNI_INIT_KEY 165
+ #define EVP_F_AESNI_XTS_CIPHER 176
+ #define EVP_F_AES_INIT_KEY 133
++#define EVP_F_AES_T4_INIT_KEY 178
+ #define EVP_F_AES_XTS 172
+ #define EVP_F_AES_XTS_CIPHER 175
+ #define EVP_F_CAMELLIA_INIT_KEY 159
+Index: openssl/crypto/bn/bn_exp.c
+===================================================================
+--- bn_exp.c.orig 2016-09-09 14:46:47.271555005 -0700
++++ bn_exp.c 2016-09-09 16:04:47.477700835 -0700
+@@ -124,8 +124,15 @@
+ # ifndef alloca
+ # define alloca(s) __builtin_alloca((s))
+ # endif
++#else
++#include <alloca.h>
+ #endif
+
++#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
++# include "sparc_arch.h"
++extern unsigned int OPENSSL_sparcv9cap_P[];
++#endif
++
+ /* maximum precomputation table size for *variable* sliding windows */
+ #define TABLE_SIZE 32
+
+@@ -468,7 +475,15 @@
+ wstart=bits-1; /* The top bit of the window */
+ wend=0; /* The bottom bit of the window */
+
++#if 1 /* by Shay Gueron's suggestion */
++ j = mont->N.top; /* borrow j */
++ if (bn_wexpand(r,j) == NULL) goto err;
++ r->d[0] = (0-m->d[0])&BN_MASK2; /* 2^(top*BN_BITS2) - m */
++ for(i=1;i<j;i++) r->d[i] = (~m->d[i])&BN_MASK2;
++ r->top = j;
++#else
+ if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
++#endif
+ for (;;)
+ {
+ if (BN_is_bit_set(p,wstart) == 0)
+@@ -520,6 +535,17 @@
+ start=0;
+ if (wstart < 0) break;
+ }
++#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
++ if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3|SPARCV9_PREFER_FPU)) {
++ j = mont->N.top; /* borrow j */
++ val[0]->d[0] = 1; /* borrow val[0] */
++ for (i=1;i<j;i++)
++ val[0]->d[i] = 0;
++ val[0]->top = j;
++ if (!BN_mod_mul_montgomery(rr, r, val[0], mont, ctx))
++ goto err;
++ } else
++#endif
+ if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
+ ret=1;
+ err:
+@@ -529,7 +555,26 @@
+ return(ret);
+ }
+
++#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
++static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos) { /* returns the BN_BITS2-bit window of a that starts at bit bitpos */
++ BN_ULONG ret = 0; /* stays 0 when the window starts outside a's used words */
++ int wordpos;
+
++ wordpos = bitpos / BN_BITS2; /* index of the word holding bit bitpos */
++ bitpos %= BN_BITS2; /* from here on: bit offset inside that word */
++ if (wordpos>=0 && wordpos < a->top) {
++ ret = a->d[wordpos]&BN_MASK2;
++ if (bitpos) { /* non-zero offset only; this also keeps the left shift below from being a full BN_BITS2 */
++ ret >>= bitpos;
++ if (++wordpos < a->top) /* words at or above a->top contribute nothing */
++ ret |= a->d[wordpos]<<(BN_BITS2-bitpos);
++ }
++ }
++
++ return ret & BN_MASK2;
++}
++#endif
++
+ /* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific layout
+ * so that accessing any of these table values shows the same access pattern as far
+ * as cache lines are concerned. The following functions are used to transfer a BIGNUM
+@@ -588,6 +633,9 @@
+ int powerbufLen = 0;
+ unsigned char *powerbuf=NULL;
+ BIGNUM tmp, am;
++#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc)
++ unsigned int t4=0;
++#endif
+
+ bn_check_top(a);
+ bn_check_top(p);
+@@ -622,9 +670,17 @@
+
+ /* Get the window size to use with size of p. */
+ window = BN_window_bits_for_ctime_exponent_size(bits);
++#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc)
++ if (window>=5 && (top&15)==0 && top<=64 &&
++ (OPENSSL_sparcv9cap_P[1]&(CFR_MONTMUL|CFR_MONTSQR))==
++ (CFR_MONTMUL|CFR_MONTSQR) && (t4=OPENSSL_sparcv9cap_P[0]))
++ window=5;
++ else
++#endif
+ #if defined(OPENSSL_BN_ASM_MONT5)
+ if (window==6 && bits<=1024) window=5; /* ~5% improvement of 2048-bit RSA sign */
+ #endif
++ (void) 0;
+
+ /* Allocate a buffer large enough to hold all of the pre-computed
+ * powers of am, am itself and tmp.
+@@ -657,9 +713,9 @@
+ tmp.flags = am.flags = BN_FLG_STATIC_DATA;
+
+ /* prepare a^0 in Montgomery domain */
+-#if 1
++#if 0
+ if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx)) goto err;
+-#else
++#else /* by Shay Gueron's suggestion */
+ tmp.d[0] = (0-m->d[0])&BN_MASK2; /* 2^(top*BN_BITS2) - m */
+ for (i=1;i<top;i++)
+ tmp.d[i] = (~m->d[i])&BN_MASK2;
+@@ -673,7 +729,122 @@
+ if (!BN_to_montgomery(&am,&am,mont,ctx)) goto err;
+ }
+ else if (!BN_to_montgomery(&am,a,mont,ctx)) goto err;
++#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc)
++ if (t4) {
++ typedef int (*bn_pwr5_mont_f)(BN_ULONG *tp,const BN_ULONG *np,
++ const BN_ULONG *n0,const void *table,int power,int bits);
++ int bn_pwr5_mont_t4_8(BN_ULONG *tp,const BN_ULONG *np,
++ const BN_ULONG *n0,const void *table,int power,int bits);
++ int bn_pwr5_mont_t4_16(BN_ULONG *tp,const BN_ULONG *np,
++ const BN_ULONG *n0,const void *table,int power,int bits);
++ int bn_pwr5_mont_t4_24(BN_ULONG *tp,const BN_ULONG *np,
++ const BN_ULONG *n0,const void *table,int power,int bits);
++ int bn_pwr5_mont_t4_32(BN_ULONG *tp,const BN_ULONG *np,
++ const BN_ULONG *n0,const void *table,int power,int bits);
++ static const bn_pwr5_mont_f pwr5_funcs[4] = {
++ bn_pwr5_mont_t4_8, bn_pwr5_mont_t4_16,
++ bn_pwr5_mont_t4_24, bn_pwr5_mont_t4_32 };
++ bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top/16-1];
+
++ typedef int (*bn_mul_mont_f)(BN_ULONG *rp,const BN_ULONG *ap,
++ const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++ int bn_mul_mont_t4_8(BN_ULONG *rp,const BN_ULONG *ap,
++ const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++ int bn_mul_mont_t4_16(BN_ULONG *rp,const BN_ULONG *ap,
++ const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++ int bn_mul_mont_t4_24(BN_ULONG *rp,const BN_ULONG *ap,
++ const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++ int bn_mul_mont_t4_32(BN_ULONG *rp,const BN_ULONG *ap,
++ const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++ static const bn_mul_mont_f mul_funcs[4] = {
++ bn_mul_mont_t4_8, bn_mul_mont_t4_16,
++ bn_mul_mont_t4_24, bn_mul_mont_t4_32 };
++ bn_mul_mont_f mul_worker = mul_funcs[top/16-1];
++
++ void bn_mul_mont_vis3(BN_ULONG *rp,const BN_ULONG *ap,
++ const void *bp,const BN_ULONG *np,
++ const BN_ULONG *n0,int num);
++ void bn_mul_mont_t4(BN_ULONG *rp,const BN_ULONG *ap,
++ const void *bp,const BN_ULONG *np,
++ const BN_ULONG *n0,int num);
++ void bn_mul_mont_gather5_t4(BN_ULONG *rp,const BN_ULONG *ap,
++ const void *table,const BN_ULONG *np,
++ const BN_ULONG *n0,int num,int power);
++ void bn_flip_n_scatter5_t4(const BN_ULONG *inp,size_t num,
++ void *table,size_t power);
++ void bn_gather5_t4(BN_ULONG *out,size_t num,
++ void *table,size_t power);
++ void bn_flip_t4(BN_ULONG *dst,BN_ULONG *src,size_t num);
++
++ BN_ULONG *np=mont->N.d, *n0=mont->n0;
++ int stride = 5*(6-(top/16-1)); /* multiple of 5, but less than 32 */
++
++ /*
++ * BN_to_montgomery can contaminate words above .top
++ * [in BN_DEBUG[_DEBUG] build]...
++ */
++ for (i=am.top; i<top; i++) am.d[i]=0;
++ for (i=tmp.top; i<top; i++) tmp.d[i]=0;
++
++ bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,0);
++ bn_flip_n_scatter5_t4(am.d,top,powerbuf,1);
++ if (!(*mul_worker)(tmp.d,am.d,am.d,np,n0) &&
++ !(*mul_worker)(tmp.d,am.d,am.d,np,n0))
++ bn_mul_mont_vis3(tmp.d,am.d,am.d,np,n0,top);
++ bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,2);
++
++ for (i=3; i<32; i++) {
++ /* Calculate a^i = a^(i-1) * a */
++ if (!(*mul_worker)(tmp.d,tmp.d,am.d,np,n0) &&
++ !(*mul_worker)(tmp.d,tmp.d,am.d,np,n0))
++ bn_mul_mont_vis3(tmp.d,tmp.d,am.d,np,n0,top);
++ bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,i);
++ }
++
++ /* switch to 64-bit domain */
++ np = alloca(top*sizeof(BN_ULONG));
++ top /= 2;
++ bn_flip_t4(np,mont->N.d,top);
++
++ bits--;
++ for (wvalue=0, i=bits%5; i>=0; i--,bits--)
++ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
++ bn_gather5_t4(tmp.d,top,powerbuf,wvalue);
++
++ /* Scan the exponent one window at a time starting from the most
++ * significant bits.
++ */
++ while (bits >= 0) {
++ if (bits < stride)
++ stride = bits+1;
++ bits -= stride;
++ wvalue = (bn_get_bits(p,bits+1));
++
++ if ((*pwr5_worker)(tmp.d,np,n0,powerbuf,wvalue,stride))
++ continue;
++ /* retry once and fall back */
++ if ((*pwr5_worker)(tmp.d,np,n0,powerbuf,wvalue,stride))
++ continue;
++
++ bits += stride-5;
++ wvalue >>= stride-5;
++ wvalue &= 31;
++ bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++ bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++ bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++ bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++ bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++ bn_mul_mont_gather5_t4(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
++ }
++
++ bn_flip_t4(tmp.d,tmp.d,top);
++ top *= 2;
++ /* back to 32-bit domain */
++ tmp.top=top;
++ bn_correct_top(&tmp);
++ OPENSSL_cleanse(np,top*sizeof(BN_ULONG));
++ } else
++#endif
+ #if defined(OPENSSL_BN_ASM_MONT5)
+ /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
+ * specifically optimization of cache-timing attack countermeasures
+@@ -812,6 +983,15 @@
+ }
+
+ /* Convert the final result from montgomery to standard format */
++#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
++ if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3|SPARCV9_PREFER_FPU)) {
++ am.d[0] = 1; /* borrow am */
++ for (i = 1; i < top; i++)
++ am.d[i] = 0;
++ if (!BN_mod_mul_montgomery(rr,&tmp,&am,mont,ctx))
++ goto err;
++ } else
++#endif
+ if (!BN_from_montgomery(rr,&tmp,mont,ctx)) goto err;
+ ret=1;
+ err:
+Index: fips/fipssyms.h
+===================================================================
+--- a/fips/fipssyms.h 2016-10-14 11:12:58.496245385 -0700
++++ b/fips/fipssyms.h 2016-10-14 11:12:49.159899380 -0700
+@@ -476,19 +476,56 @@
+ #define SHA512_Update fips_sha512_update
+ #define SHA512_version fips_sha512_version
+ #define _shadow_DES_check_key fips__shadow_des_check_key
++#define aes_t4_decrypt fips_aes_t4_decrypt
++#define aes_t4_encrypt fips_aes_t4_encrypt
++#define aes_t4_set_decrypt_key fips_aes_t4_set_decrypt_key
++#define aes_t4_set_encrypt_key fips_aes_t4_set_encrypt_key
++#define aes128_t4_cbc_decrypt fips_aes128_t4_cbc_decrypt
++#define aes128_t4_cbc_encrypt fips_aes128_t4_cbc_encrypt
++#define aes128_t4_ctr32_encrypt fips_aes128_t4_ctr32_encrypt
++#define aes128_t4_xts_decrypt fips_aes128_t4_xts_decrypt
++#define aes128_t4_xts_encrypt fips_aes128_t4_xts_encrypt
++#define aes192_t4_cbc_decrypt fips_aes192_t4_cbc_decrypt
++#define aes192_t4_cbc_encrypt fips_aes192_t4_cbc_encrypt
++#define aes192_t4_ctr32_encrypt fips_aes192_t4_ctr32_encrypt
++#define aes256_t4_cbc_decrypt fips_aes256_t4_cbc_decrypt
++#define aes256_t4_cbc_encrypt fips_aes256_t4_cbc_encrypt
++#define aes256_t4_ctr32_encrypt fips_aes256_t4_ctr32_encrypt
++#define aes256_t4_xts_decrypt fips_aes256_t4_xts_decrypt
++#define aes256_t4_xts_encrypt fips_aes256_t4_xts_encrypt
++#define bn_GF2m_mul_2x2 fips_bn_GF2m_mul_2x2
+ #define bn_add_part_words fips_bn_add_part_words
+ #define bn_cmp_part_words fips_bn_cmp_part_words
+ #define bn_cmp_words fips_bn_cmp_words
+ #define bn_dup_expand fips_bn_dup_expand
+ #define bn_expand2 fips_bn_expand2
++#define bn_flip_n_scatter5_t4 fips_bn_flip_n_scatter5_t4
++#define bn_flip_t4 fips_bn_flip_t4
++#define bn_gather5_t4 fips_bn_gather5_t4
+ #define bn_mul_high fips_bn_mul_high
+ #define bn_mul_low_normal fips_bn_mul_low_normal
+ #define bn_mul_low_recursive fips_bn_mul_low_recursive
++#define bn_mul_mont_gather5_t4 fips_bn_mul_mont_gather5_t4
++#define bn_mul_mont_t4 fips_bn_mul_mont_t4
++#define bn_mul_mont_t4_8 fips_bn_mul_mont_t4_8
++#define bn_mul_mont_t4_16 fips_bn_mul_mont_t4_16
++#define bn_mul_mont_t4_24 fips_bn_mul_mont_t4_24
++#define bn_mul_mont_t4_32 fips_bn_mul_mont_t4_32
++#define bn_mul_mont_vis3 fips_bn_mul_mont_vis3
+ #define bn_mul_normal fips_bn_mul_normal
+ #define bn_mul_part_recursive fips_bn_mul_part_recursive
+ #define bn_mul_recursive fips_bn_mul_recursive
++#define bn_pwr5_mont_t4_8 fips_bn_pwr5_mont_t4_8
++#define bn_pwr5_mont_t4_16 fips_bn_pwr5_mont_t4_16
++#define bn_pwr5_mont_t4_24 fips_bn_pwr5_mont_t4_24
++#define bn_pwr5_mont_t4_32 fips_bn_pwr5_mont_t4_32
+ #define bn_sqr_normal fips_bn_sqr_normal
+ #define bn_sqr_recursive fips_bn_sqr_recursive
++#define des_t4_cbc_decrypt fips_des_t4_cbc_decrypt
++#define des_t4_cbc_encrypt fips_des_t4_cbc_encrypt
++#define des_t4_ede3_cbc_decrypt fips_des_t4_ede3_cbc_decrypt
++#define des_t4_ede3_cbc_encrypt fips_des_t4_ede3_cbc_encrypt
++#define des_t4_key_expand fips_des_t4_key_expand
+ #define dsa_builtin_paramgen fips_dsa_builtin_paramgen
+ #define dsa_builtin_paramgen2 fips_dsa_builtin_paramgen2
+ #define dsa_paramgen_check_g fips_dsa_paramgen_check_g
+Index: fips/sha/fips_standalone_sha1.c
+===================================================================
+--- a/fips/sha/fips_standalone_sha1.c 2016-06-20 12:49:46.000000000 -0700
++++ b/fips/sha/fips_standalone_sha1.c 2016-10-25 09:26:32.105775365 -0700
+@@ -60,6 +60,7 @@
+ void FIPS_selftest_check() {}
+ void OPENSSL_cleanse(void *p,size_t len) {}
+ unsigned int OPENSSL_ia32cap_P[2];
++unsigned int OPENSSL_sparcv9cap_P[2];
+ #endif
+
+ #ifdef OPENSSL_FIPS