24830961 OpenSSL FIPS work based on 1.0.2 for Oracle Solaris: Phase II
author	Misaki Miyashita <misaki.miyashita@oracle.com>
Wed, 02 Nov 2016 19:15:09 -0700
changeset 7239 81dd404b35f2
parent 7238 96025c3f5cac
child 7240 66893879cb20
24830961 OpenSSL FIPS work based on 1.0.2 for Oracle Solaris: Phase II
components/openssl/openssl-fips-140/patches/204-fips-by-default.patch
components/openssl/openssl-fips/Makefile
components/openssl/openssl-fips/inline-t4/aest4-sparcv9.pl
components/openssl/openssl-fips/inline-t4/dest4-sparcv9.pl
components/openssl/openssl-fips/inline-t4/e_aes.c
components/openssl/openssl-fips/inline-t4/e_des3.c
components/openssl/openssl-fips/inline-t4/sha1-sparcv9.pl
components/openssl/openssl-fips/inline-t4/sha512-sparcv9.pl
components/openssl/openssl-fips/inline-t4/sparc_arch.h
components/openssl/openssl-fips/inline-t4/sparct4-mont.pl
components/openssl/openssl-fips/inline-t4/sparcv9-gf2m.pl
components/openssl/openssl-fips/inline-t4/sparcv9_modes.pl
components/openssl/openssl-fips/inline-t4/vis3-mont.pl
components/openssl/openssl-fips/patches/302-t4-inline.patch
--- a/components/openssl/openssl-fips-140/patches/204-fips-by-default.patch	Wed Nov 02 10:26:04 2016 -0700
+++ b/components/openssl/openssl-fips-140/patches/204-fips-by-default.patch	Wed Nov 02 19:15:09 2016 -0700
@@ -1,5 +1,6 @@
 # Developed in house: Solaris specific
 # This patch enables FIPS mode in the _init routine.
+# Also, use EVP API instead of low level SHA API
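+# A minimal sketch of the EVP replacement pattern used below (names here are
+# illustrative only; the motivation is presumably that the benchmark should
+# exercise the FIPS-capable EVP layer rather than the low-level SHA calls):
+#     unsigned char md[SHA256_DIGEST_LENGTH];
+#     EVP_Digest(data, data_len, md, NULL, EVP_sha256(), NULL);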
 --- a/crypto/cryptlib.c 2016-09-02 14:10:14.157867400 -0700
 +++ b/crypto/cryptlib.c 2016-09-02 14:08:38.308229315 -0700
 @@ -117,6 +117,8 @@
@@ -26,3 +27,25 @@
      (void) pthread_atfork(solaris_fork_prep, solaris_fork_post, solaris_fork_post);
  }
  
+--- a/apps/speed.c	2016-09-26 02:49:07.000000000 -0700
++++ b/apps/speed.c	2016-10-25 11:26:37.455939170 -0700
+@@ -1640,7 +1640,8 @@
+             print_message(names[D_SHA256], c[D_SHA256][j], lengths[j]);
+             Time_F(START);
+             for (count = 0, run = 1; COND(c[D_SHA256][j]); count++)
+-                SHA256(buf, lengths[j], sha256);
++                EVP_Digest(buf, (unsigned long)lengths[j], sha256, NULL,
++                           EVP_sha256(), NULL);
+             d = Time_F(STOP);
+             print_result(D_SHA256, j, count, d);
+         }
+@@ -1653,7 +1654,8 @@
+             print_message(names[D_SHA512], c[D_SHA512][j], lengths[j]);
+             Time_F(START);
+             for (count = 0, run = 1; COND(c[D_SHA512][j]); count++)
+-                SHA512(buf, lengths[j], sha512);
++                EVP_Digest(buf, (unsigned long)lengths[j], sha512, NULL,
++                           EVP_sha512(), NULL);
+             d = Time_F(STOP);
+             print_result(D_SHA512, j, count, d);
+         }
--- a/components/openssl/openssl-fips/Makefile	Wed Nov 02 10:26:04 2016 -0700
+++ b/components/openssl/openssl-fips/Makefile	Wed Nov 02 19:15:09 2016 -0700
@@ -80,6 +80,20 @@
 COMPONENT_PRE_CONFIGURE_ACTION = ( cd $(@D); \
     $(RM) $(SOURCE_DIR)/test/fips_aes_data; $(CP) -r $(SOURCE_DIR)/* .; )
 
+COMPONENT_POST_UNPACK_ACTION = \
+    ( echo "Cloning engines..."; \
+	$(LN) -fs $(COMPONENT_DIR)/inline-t4/sparc_arch.h		$(@D)/crypto/; \
+	$(LN) -fs $(COMPONENT_DIR)/inline-t4/aest4-sparcv9.pl		$(@D)/crypto/aes/asm; \
+	$(LN) -fs $(COMPONENT_DIR)/inline-t4/dest4-sparcv9.pl		$(@D)/crypto/des/asm; \
+	$(LN) -fs $(COMPONENT_DIR)/inline-t4/sparcv9_modes.pl		$(@D)/crypto/perlasm; \
+	$(LN) -fs $(COMPONENT_DIR)/inline-t4/vis3-mont.pl		$(@D)/crypto/bn/asm; \
+	$(LN) -fs $(COMPONENT_DIR)/inline-t4/sparcv9-gf2m.pl		$(@D)/crypto/bn/asm; \
+	$(LN) -fs $(COMPONENT_DIR)/inline-t4/sparct4-mont.pl		$(@D)/crypto/bn/asm; \
+	$(LN) -fs $(COMPONENT_DIR)/inline-t4/e_des3.c			$(@D)/crypto/evp; \
+	$(LN) -fs $(COMPONENT_DIR)/inline-t4/e_aes.c			$(@D)/crypto/evp; \
+	$(LN) -fs $(COMPONENT_DIR)/inline-t4/sha1-sparcv9.pl		$(@D)/crypto/sha/asm; \
+	$(LN) -fs $(COMPONENT_DIR)/inline-t4/sha512-sparcv9.pl		$(@D)/crypto/sha/asm; )
+
 # There is a specific way that must be followed to build the FIPS-140 canister.
 # It is "./config fipscanisterbuild; make; make install" and is called a command
 # set "U2" in the OpenSSL FIPS-140 User Guide.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/aest4-sparcv9.pl	Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,921 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by David S. Miller <[email protected]> and Andy Polyakov
+# <[email protected]>. The module is licensed under 2-clause BSD
+# license. October 2012. All rights reserved.
+# ====================================================================
+
+######################################################################
+# AES for SPARC T4.
+#
+# AES round instructions complete in 3 cycles and can be issued every
+# cycle. This means that the round calculations should take 4*rounds cycles,
+# because any given round instruction depends on the result of *both*
+# previous instructions:
+#
+#	|0 |1 |2 |3 |4
+#	|01|01|01|
+#	   |23|23|23|
+#	            |01|01|...
+#	               |23|...
+#
+# Provided that fxor [with IV] takes 3 cycles to complete, critical
+# path length for CBC encrypt would be 3+4*rounds, or in other words
+# it should process one byte in at least (3+4*rounds)/16 cycles. This
+# estimate doesn't account for "collateral" instructions, such as
+# fetching input from memory, xor-ing it with zero-round key and
+# storing the result. Yet, *measured* performance [for data aligned
+# at 64-bit boundary!] deviates from this equation by less than 0.5%:
+#
+#		128-bit key	192-		256-
+# CBC encrypt	2.70/2.90(*)	3.20/3.40	3.70/3.90
+#			 (*) numbers after slash are for
+#			     misaligned data.
+#
+# Out-of-order execution logic managed to fully overlap "collateral"
+# instructions with those on critical path. Amazing!
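+#
+# As a quick sanity check of the estimate above (a note, not an upstream
+# measurement): a 128-bit key means 10 rounds, so (3+4*10)/16 = 43/16, or
+# about 2.69 cycles per byte, which is within 0.5% of the measured 2.70
+# for aligned CBC encrypt; the 192- and 256-bit columns work out the same
+# way (51/16 ~= 3.19 and 59/16 ~= 3.69).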
+#
+# As with Intel AES-NI, the question is whether it's possible to improve
+# performance of parallelizable modes by interleaving round
+# instructions. Given the round instruction latency and throughput, the
+# optimal interleave factor is 2. But can we expect a 2x performance
+# improvement? Well, as round instructions can be issued one per
+# cycle, they don't saturate the 2-way issue pipeline and therefore
+# there is room for "collateral" calculations... Yet, a 2x speed-up
+# over CBC encrypt remains unattainable:
+#
+#		128-bit key	192-		256-
+# CBC decrypt	1.64/2.11	1.89/2.37	2.23/2.61
+# CTR		1.64/2.08(*)	1.89/2.33	2.23/2.61
+#			 (*) numbers after slash are for
+#			     misaligned data.
+#
+# Estimates based on the instruction count, under the assumption that
+# round instructions are not pairable with any other instruction,
+# suggest that the latter is indeed the case and the pipeline runs
+# underutilized. It should be noted that the T4 out-of-order execution
+# logic is so capable that the performance gain from 2x interleave is
+# not even impressive, ~7-13% over non-interleaved code, largest
+# for 256-bit keys.
+
+# For reference, the software implementation processes one byte in
+# 29 cycles with a 128-bit key on the same processor. Intel
+# Sandy Bridge encrypts a byte in 5.07 cycles in CBC mode and decrypts
+# one in 0.93, naturally with AES-NI.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "sparcv9_modes.pl";
+
+&asm_init(@ARGV);
+
+$::evp=1;	# if $evp is set to 0, script generates module with
+# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
+# points. These however are not fully compatible with openssl/aes.h,
+# because they expect AES_KEY to be aligned at 64-bit boundary. When
+# used through EVP, alignment is arranged at EVP layer. Second thing
+# that is arranged by EVP is at least 32-bit alignment of IV.
+
+######################################################################
+# single-round subroutines
+#
+{
+my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
+
+$code.=<<___ if ($::abibits==64);
+.register	%g2,#scratch
+.register	%g3,#scratch
+
+___
+$code.=<<___;
+#include <openssl/fipssyms.h>
+
+.text
+
+.globl	aes_t4_encrypt
+.align	32
+aes_t4_encrypt:
+	andcc		$inp, 7, %g1		! is input aligned?
+	andn		$inp, 7, $inp
+
+	ldx		[$key + 0], %g4
+	ldx		[$key + 8], %g5
+
+	ldx		[$inp + 0], %o4
+	bz,pt		%icc, 1f
+	ldx		[$inp + 8], %o5
+	ldx		[$inp + 16], $inp
+	sll		%g1, 3, %g1
+	sub		%g0, %g1, %o3
+	sllx		%o4, %g1, %o4
+	sllx		%o5, %g1, %g1
+	srlx		%o5, %o3, %o5
+	srlx		$inp, %o3, %o3
+	or		%o5, %o4, %o4
+	or		%o3, %g1, %o5
+1:
+	ld		[$key + 240], $rounds
+	ldd		[$key + 16], %f12
+	ldd		[$key + 24], %f14
+	xor		%g4, %o4, %o4
+	xor		%g5, %o5, %o5
+	movxtod		%o4, %f0
+	movxtod		%o5, %f2
+	srl		$rounds, 1, $rounds
+	ldd		[$key + 32], %f16
+	sub		$rounds, 1, $rounds
+	ldd		[$key + 40], %f18
+	add		$key, 48, $key
+
+.Lenc:
+	aes_eround01	%f12, %f0, %f2, %f4
+	aes_eround23	%f14, %f0, %f2, %f2
+	ldd		[$key + 0], %f12
+	ldd		[$key + 8], %f14
+	sub		$rounds,1,$rounds
+	aes_eround01	%f16, %f4, %f2, %f0
+	aes_eround23	%f18, %f4, %f2, %f2
+	ldd		[$key + 16], %f16
+	ldd		[$key + 24], %f18
+	brnz,pt		$rounds, .Lenc
+	add		$key, 32, $key
+
+	andcc		$out, 7, $tmp		! is output aligned?
+	aes_eround01	%f12, %f0, %f2, %f4
+	aes_eround23	%f14, %f0, %f2, %f2
+	aes_eround01_l	%f16, %f4, %f2, %f0
+	aes_eround23_l	%f18, %f4, %f2, %f2
+
+	bnz,pn		%icc, 2f
+	nop
+
+	std		%f0, [$out + 0]
+	retl
+	std		%f2, [$out + 8]
+
+2:	alignaddrl	$out, %g0, $out
+	mov		0xff, $mask
+	srl		$mask, $tmp, $mask
+
+	faligndata	%f0, %f0, %f4
+	faligndata	%f0, %f2, %f6
+	faligndata	%f2, %f2, %f8
+
+	stda		%f4, [$out + $mask]0xc0	! partial store
+	std		%f6, [$out + 8]
+	add		$out, 16, $out
+	orn		%g0, $mask, $mask
+	retl
+	stda		%f8, [$out + $mask]0xc0	! partial store
+.type	aes_t4_encrypt,#function
+.size	aes_t4_encrypt,.-aes_t4_encrypt
+
+.globl	aes_t4_decrypt
+.align	32
+aes_t4_decrypt:
+	andcc		$inp, 7, %g1		! is input aligned?
+	andn		$inp, 7, $inp
+
+	ldx		[$key + 0], %g4
+	ldx		[$key + 8], %g5
+
+	ldx		[$inp + 0], %o4
+	bz,pt		%icc, 1f
+	ldx		[$inp + 8], %o5
+	ldx		[$inp + 16], $inp
+	sll		%g1, 3, %g1
+	sub		%g0, %g1, %o3
+	sllx		%o4, %g1, %o4
+	sllx		%o5, %g1, %g1
+	srlx		%o5, %o3, %o5
+	srlx		$inp, %o3, %o3
+	or		%o5, %o4, %o4
+	or		%o3, %g1, %o5
+1:
+	ld		[$key + 240], $rounds
+	ldd		[$key + 16], %f12
+	ldd		[$key + 24], %f14
+	xor		%g4, %o4, %o4
+	xor		%g5, %o5, %o5
+	movxtod		%o4, %f0
+	movxtod		%o5, %f2
+	srl		$rounds, 1, $rounds
+	ldd		[$key + 32], %f16
+	sub		$rounds, 1, $rounds
+	ldd		[$key + 40], %f18
+	add		$key, 48, $key
+
+.Ldec:
+	aes_dround01	%f12, %f0, %f2, %f4
+	aes_dround23	%f14, %f0, %f2, %f2
+	ldd		[$key + 0], %f12
+	ldd		[$key + 8], %f14
+	sub		$rounds,1,$rounds
+	aes_dround01	%f16, %f4, %f2, %f0
+	aes_dround23	%f18, %f4, %f2, %f2
+	ldd		[$key + 16], %f16
+	ldd		[$key + 24], %f18
+	brnz,pt		$rounds, .Ldec
+	add		$key, 32, $key
+
+	andcc		$out, 7, $tmp		! is output aligned?
+	aes_dround01	%f12, %f0, %f2, %f4
+	aes_dround23	%f14, %f0, %f2, %f2
+	aes_dround01_l	%f16, %f4, %f2, %f0
+	aes_dround23_l	%f18, %f4, %f2, %f2
+
+	bnz,pn		%icc, 2f
+	nop
+
+	std		%f0, [$out + 0]
+	retl
+	std		%f2, [$out + 8]
+
+2:	alignaddrl	$out, %g0, $out
+	mov		0xff, $mask
+	srl		$mask, $tmp, $mask
+
+	faligndata	%f0, %f0, %f4
+	faligndata	%f0, %f2, %f6
+	faligndata	%f2, %f2, %f8
+
+	stda		%f4, [$out + $mask]0xc0	! partial store
+	std		%f6, [$out + 8]
+	add		$out, 16, $out
+	orn		%g0, $mask, $mask
+	retl
+	stda		%f8, [$out + $mask]0xc0	! partial store
+.type	aes_t4_decrypt,#function
+.size	aes_t4_decrypt,.-aes_t4_decrypt
+___
+}
+
+######################################################################
+# key setup subroutines
+#
+{
+my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
+$code.=<<___;
+.globl	aes_t4_set_encrypt_key
+.align	32
+aes_t4_set_encrypt_key:
+.Lset_encrypt_key:
+	and		$inp, 7, $tmp
+	alignaddr	$inp, %g0, $inp
+	cmp		$bits, 192
+	ldd		[$inp + 0], %f0
+	bl,pt		%icc,.L128
+	ldd		[$inp + 8], %f2
+
+	be,pt		%icc,.L192
+	ldd		[$inp + 16], %f4
+	brz,pt		$tmp, .L256aligned
+	ldd		[$inp + 24], %f6
+
+	ldd		[$inp + 32], %f8
+	faligndata	%f0, %f2, %f0
+	faligndata	%f2, %f4, %f2
+	faligndata	%f4, %f6, %f4
+	faligndata	%f6, %f8, %f6
+.L256aligned:
+___
+for ($i=0; $i<6; $i++) {
+    $code.=<<___;
+	std		%f0, [$out + `32*$i+0`]
+	aes_kexpand1	%f0, %f6, $i, %f0
+	std		%f2, [$out + `32*$i+8`]
+	aes_kexpand2	%f2, %f0, %f2
+	std		%f4, [$out + `32*$i+16`]
+	aes_kexpand0	%f4, %f2, %f4
+	std		%f6, [$out + `32*$i+24`]
+	aes_kexpand2	%f6, %f4, %f6
+___
+}
+$code.=<<___;
+	std		%f0, [$out + `32*$i+0`]
+	aes_kexpand1	%f0, %f6, $i, %f0
+	std		%f2, [$out + `32*$i+8`]
+	aes_kexpand2	%f2, %f0, %f2
+	std		%f4, [$out + `32*$i+16`]
+	std		%f6, [$out + `32*$i+24`]
+	std		%f0, [$out + `32*$i+32`]
+	std		%f2, [$out + `32*$i+40`]
+
+	mov		14, $tmp
+	st		$tmp, [$out + 240]
+	retl
+	xor		%o0, %o0, %o0
+
+.align	16
+.L192:
+	brz,pt		$tmp, .L192aligned
+	nop
+
+	ldd		[$inp + 24], %f6
+	faligndata	%f0, %f2, %f0
+	faligndata	%f2, %f4, %f2
+	faligndata	%f4, %f6, %f4
+.L192aligned:
+___
+for ($i=0; $i<7; $i++) {
+    $code.=<<___;
+	std		%f0, [$out + `24*$i+0`]
+	aes_kexpand1	%f0, %f4, $i, %f0
+	std		%f2, [$out + `24*$i+8`]
+	aes_kexpand2	%f2, %f0, %f2
+	std		%f4, [$out + `24*$i+16`]
+	aes_kexpand2	%f4, %f2, %f4
+___
+}
+$code.=<<___;
+	std		%f0, [$out + `24*$i+0`]
+	aes_kexpand1	%f0, %f4, $i, %f0
+	std		%f2, [$out + `24*$i+8`]
+	aes_kexpand2	%f2, %f0, %f2
+	std		%f4, [$out + `24*$i+16`]
+	std		%f0, [$out + `24*$i+24`]
+	std		%f2, [$out + `24*$i+32`]
+
+	mov		12, $tmp
+	st		$tmp, [$out + 240]
+	retl
+	xor		%o0, %o0, %o0
+
+.align	16
+.L128:
+	brz,pt		$tmp, .L128aligned
+	nop
+
+	ldd		[$inp + 16], %f4
+	faligndata	%f0, %f2, %f0
+	faligndata	%f2, %f4, %f2
+.L128aligned:
+___
+for ($i=0; $i<10; $i++) {
+    $code.=<<___;
+	std		%f0, [$out + `16*$i+0`]
+	aes_kexpand1	%f0, %f2, $i, %f0
+	std		%f2, [$out + `16*$i+8`]
+	aes_kexpand2	%f2, %f0, %f2
+___
+}
+$code.=<<___;
+	std		%f0, [$out + `16*$i+0`]
+	std		%f2, [$out + `16*$i+8`]
+
+	mov		10, $tmp
+	st		$tmp, [$out + 240]
+	retl
+	xor		%o0, %o0, %o0
+.type	aes_t4_set_encrypt_key,#function
+.size	aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
+
+.globl	aes_t4_set_decrypt_key
+.align	32
+aes_t4_set_decrypt_key:
+	mov		%o7, %o5
+	call		.Lset_encrypt_key
+	nop
+
+	mov		%o5, %o7
+	sll		$tmp, 4, $inp		! $tmp is number of rounds
+	add		$tmp, 2, $tmp
+	add		$out, $inp, $inp	! $inp=$out+16*rounds
+	srl		$tmp, 2, $tmp		! $tmp=(rounds+2)/4
+
+.Lkey_flip:
+	ldd		[$out + 0],  %f0
+	ldd		[$out + 8],  %f2
+	ldd		[$out + 16], %f4
+	ldd		[$out + 24], %f6
+	ldd		[$inp + 0],  %f8
+	ldd		[$inp + 8],  %f10
+	ldd		[$inp - 16], %f12
+	ldd		[$inp - 8],  %f14
+	sub		$tmp, 1, $tmp
+	std		%f0, [$inp + 0]
+	std		%f2, [$inp + 8]
+	std		%f4, [$inp - 16]
+	std		%f6, [$inp - 8]
+	std		%f8, [$out + 0]
+	std		%f10, [$out + 8]
+	std		%f12, [$out + 16]
+	std		%f14, [$out + 24]
+	add		$out, 32, $out
+	brnz		$tmp, .Lkey_flip
+	sub		$inp, 32, $inp
+
+	retl
+	xor		%o0, %o0, %o0
+.type	aes_t4_set_decrypt_key,#function
+.size	aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
+___
+}
+
+{{{
+my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
+my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
+
+$code.=<<___;
+.align	32
+_aes128_encrypt_1x:
+___
+for ($i=0; $i<4; $i++) {
+    $code.=<<___;
+	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
+	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
+	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
+	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+	aes_eround01	%f48, %f0, %f2, %f4
+	aes_eround23	%f50, %f0, %f2, %f2
+	aes_eround01_l	%f52, %f4, %f2, %f0
+	retl
+	aes_eround23_l	%f54, %f4, %f2, %f2
+.type	_aes128_encrypt_1x,#function
+.size	_aes128_encrypt_1x,.-_aes128_encrypt_1x
+
+.align	32
+_aes128_encrypt_2x:
+___
+for ($i=0; $i<4; $i++) {
+    $code.=<<___;
+	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
+	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
+	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
+	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
+	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
+	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
+	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
+	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+	aes_eround01	%f48, %f0, %f2, %f8
+	aes_eround23	%f50, %f0, %f2, %f2
+	aes_eround01	%f48, %f4, %f6, %f10
+	aes_eround23	%f50, %f4, %f6, %f6
+	aes_eround01_l	%f52, %f8, %f2, %f0
+	aes_eround23_l	%f54, %f8, %f2, %f2
+	aes_eround01_l	%f52, %f10, %f6, %f4
+	retl
+	aes_eround23_l	%f54, %f10, %f6, %f6
+.type	_aes128_encrypt_2x,#function
+.size	_aes128_encrypt_2x,.-_aes128_encrypt_2x
+
+.align	32
+_aes128_loadkey:
+	ldx		[$key + 0], %g4
+	ldx		[$key + 8], %g5
+___
+for ($i=2; $i<22;$i++) {			# load key schedule
+    $code.=<<___;
+	ldd		[$key + `8*$i`], %f`12+2*$i`
+___
+}
+$code.=<<___;
+	retl
+	nop
+.type	_aes128_loadkey,#function
+.size	_aes128_loadkey,.-_aes128_loadkey
+_aes128_load_enckey=_aes128_loadkey
+_aes128_load_deckey=_aes128_loadkey
+
+___
+
+&alg_cbc_encrypt_implement("aes",128);
+if ($::evp) {
+    &alg_ctr32_implement("aes",128);
+    &alg_xts_implement("aes",128,"en");
+    &alg_xts_implement("aes",128,"de");
+}
+&alg_cbc_decrypt_implement("aes",128);
+
+$code.=<<___;
+.align	32
+_aes128_decrypt_1x:
+___
+for ($i=0; $i<4; $i++) {
+    $code.=<<___;
+	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
+	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
+	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
+	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+	aes_dround01	%f48, %f0, %f2, %f4
+	aes_dround23	%f50, %f0, %f2, %f2
+	aes_dround01_l	%f52, %f4, %f2, %f0
+	retl
+	aes_dround23_l	%f54, %f4, %f2, %f2
+.type	_aes128_decrypt_1x,#function
+.size	_aes128_decrypt_1x,.-_aes128_decrypt_1x
+
+.align	32
+_aes128_decrypt_2x:
+___
+for ($i=0; $i<4; $i++) {
+    $code.=<<___;
+	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
+	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
+	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
+	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
+	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
+	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
+	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
+	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+	aes_dround01	%f48, %f0, %f2, %f8
+	aes_dround23	%f50, %f0, %f2, %f2
+	aes_dround01	%f48, %f4, %f6, %f10
+	aes_dround23	%f50, %f4, %f6, %f6
+	aes_dround01_l	%f52, %f8, %f2, %f0
+	aes_dround23_l	%f54, %f8, %f2, %f2
+	aes_dround01_l	%f52, %f10, %f6, %f4
+	retl
+	aes_dround23_l	%f54, %f10, %f6, %f6
+.type	_aes128_decrypt_2x,#function
+.size	_aes128_decrypt_2x,.-_aes128_decrypt_2x
+___
+
+$code.=<<___;
+.align	32
+_aes192_encrypt_1x:
+___
+for ($i=0; $i<5; $i++) {
+    $code.=<<___;
+	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
+	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
+	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
+	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+	aes_eround01	%f56, %f0, %f2, %f4
+	aes_eround23	%f58, %f0, %f2, %f2
+	aes_eround01_l	%f60, %f4, %f2, %f0
+	retl
+	aes_eround23_l	%f62, %f4, %f2, %f2
+.type	_aes192_encrypt_1x,#function
+.size	_aes192_encrypt_1x,.-_aes192_encrypt_1x
+
+.align	32
+_aes192_encrypt_2x:
+___
+for ($i=0; $i<5; $i++) {
+    $code.=<<___;
+	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
+	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
+	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
+	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
+	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
+	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
+	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
+	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+	aes_eround01	%f56, %f0, %f2, %f8
+	aes_eround23	%f58, %f0, %f2, %f2
+	aes_eround01	%f56, %f4, %f6, %f10
+	aes_eround23	%f58, %f4, %f6, %f6
+	aes_eround01_l	%f60, %f8, %f2, %f0
+	aes_eround23_l	%f62, %f8, %f2, %f2
+	aes_eround01_l	%f60, %f10, %f6, %f4
+	retl
+	aes_eround23_l	%f62, %f10, %f6, %f6
+.type	_aes192_encrypt_2x,#function
+.size	_aes192_encrypt_2x,.-_aes192_encrypt_2x
+
+.align	32
+_aes256_encrypt_1x:
+	aes_eround01	%f16, %f0, %f2, %f4
+	aes_eround23	%f18, %f0, %f2, %f2
+	ldd		[$key + 208], %f16
+	ldd		[$key + 216], %f18
+	aes_eround01	%f20, %f4, %f2, %f0
+	aes_eround23	%f22, %f4, %f2, %f2
+	ldd		[$key + 224], %f20
+	ldd		[$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+    $code.=<<___;
+	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
+	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
+	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
+	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+	aes_eround01	%f16, %f0, %f2, %f4
+	aes_eround23	%f18, %f0, %f2, %f2
+	ldd		[$key + 16], %f16
+	ldd		[$key + 24], %f18
+	aes_eround01_l	%f20, %f4, %f2, %f0
+	aes_eround23_l	%f22, %f4, %f2, %f2
+	ldd		[$key + 32], %f20
+	retl
+	ldd		[$key + 40], %f22
+.type	_aes256_encrypt_1x,#function
+.size	_aes256_encrypt_1x,.-_aes256_encrypt_1x
+
+.align	32
+_aes256_encrypt_2x:
+	aes_eround01	%f16, %f0, %f2, %f8
+	aes_eround23	%f18, %f0, %f2, %f2
+	aes_eround01	%f16, %f4, %f6, %f10
+	aes_eround23	%f18, %f4, %f6, %f6
+	ldd		[$key + 208], %f16
+	ldd		[$key + 216], %f18
+	aes_eround01	%f20, %f8, %f2, %f0
+	aes_eround23	%f22, %f8, %f2, %f2
+	aes_eround01	%f20, %f10, %f6, %f4
+	aes_eround23	%f22, %f10, %f6, %f6
+	ldd		[$key + 224], %f20
+	ldd		[$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+    $code.=<<___;
+	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
+	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
+	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
+	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
+	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
+	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
+	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
+	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+	aes_eround01	%f16, %f0, %f2, %f8
+	aes_eround23	%f18, %f0, %f2, %f2
+	aes_eround01	%f16, %f4, %f6, %f10
+	aes_eround23	%f18, %f4, %f6, %f6
+	ldd		[$key + 16], %f16
+	ldd		[$key + 24], %f18
+	aes_eround01_l	%f20, %f8, %f2, %f0
+	aes_eround23_l	%f22, %f8, %f2, %f2
+	aes_eround01_l	%f20, %f10, %f6, %f4
+	aes_eround23_l	%f22, %f10, %f6, %f6
+	ldd		[$key + 32], %f20
+	retl
+	ldd		[$key + 40], %f22
+.type	_aes256_encrypt_2x,#function
+.size	_aes256_encrypt_2x,.-_aes256_encrypt_2x
+
+.align	32
+_aes192_loadkey:
+	ldx		[$key + 0], %g4
+	ldx		[$key + 8], %g5
+___
+for ($i=2; $i<26;$i++) {			# load key schedule
+    $code.=<<___;
+	ldd		[$key + `8*$i`], %f`12+2*$i`
+___
+}
+$code.=<<___;
+	retl
+	nop
+.type	_aes192_loadkey,#function
+.size	_aes192_loadkey,.-_aes192_loadkey
+_aes256_loadkey=_aes192_loadkey
+_aes192_load_enckey=_aes192_loadkey
+_aes192_load_deckey=_aes192_loadkey
+_aes256_load_enckey=_aes192_loadkey
+_aes256_load_deckey=_aes192_loadkey
+___
+
+&alg_cbc_encrypt_implement("aes",256);
+&alg_cbc_encrypt_implement("aes",192);
+if ($::evp) {
+    &alg_ctr32_implement("aes",256);
+    &alg_xts_implement("aes",256,"en");
+    &alg_xts_implement("aes",256,"de");
+    &alg_ctr32_implement("aes",192);
+}
+&alg_cbc_decrypt_implement("aes",192);
+&alg_cbc_decrypt_implement("aes",256);
+
+$code.=<<___;
+.align	32
+_aes256_decrypt_1x:
+	aes_dround01	%f16, %f0, %f2, %f4
+	aes_dround23	%f18, %f0, %f2, %f2
+	ldd		[$key + 208], %f16
+	ldd		[$key + 216], %f18
+	aes_dround01	%f20, %f4, %f2, %f0
+	aes_dround23	%f22, %f4, %f2, %f2
+	ldd		[$key + 224], %f20
+	ldd		[$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+    $code.=<<___;
+	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
+	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
+	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
+	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+	aes_dround01	%f16, %f0, %f2, %f4
+	aes_dround23	%f18, %f0, %f2, %f2
+	ldd		[$key + 16], %f16
+	ldd		[$key + 24], %f18
+	aes_dround01_l	%f20, %f4, %f2, %f0
+	aes_dround23_l	%f22, %f4, %f2, %f2
+	ldd		[$key + 32], %f20
+	retl
+	ldd		[$key + 40], %f22
+.type	_aes256_decrypt_1x,#function
+.size	_aes256_decrypt_1x,.-_aes256_decrypt_1x
+
+.align	32
+_aes256_decrypt_2x:
+	aes_dround01	%f16, %f0, %f2, %f8
+	aes_dround23	%f18, %f0, %f2, %f2
+	aes_dround01	%f16, %f4, %f6, %f10
+	aes_dround23	%f18, %f4, %f6, %f6
+	ldd		[$key + 208], %f16
+	ldd		[$key + 216], %f18
+	aes_dround01	%f20, %f8, %f2, %f0
+	aes_dround23	%f22, %f8, %f2, %f2
+	aes_dround01	%f20, %f10, %f6, %f4
+	aes_dround23	%f22, %f10, %f6, %f6
+	ldd		[$key + 224], %f20
+	ldd		[$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+    $code.=<<___;
+	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
+	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
+	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
+	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
+	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
+	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
+	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
+	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+	aes_dround01	%f16, %f0, %f2, %f8
+	aes_dround23	%f18, %f0, %f2, %f2
+	aes_dround01	%f16, %f4, %f6, %f10
+	aes_dround23	%f18, %f4, %f6, %f6
+	ldd		[$key + 16], %f16
+	ldd		[$key + 24], %f18
+	aes_dround01_l	%f20, %f8, %f2, %f0
+	aes_dround23_l	%f22, %f8, %f2, %f2
+	aes_dround01_l	%f20, %f10, %f6, %f4
+	aes_dround23_l	%f22, %f10, %f6, %f6
+	ldd		[$key + 32], %f20
+	retl
+	ldd		[$key + 40], %f22
+.type	_aes256_decrypt_2x,#function
+.size	_aes256_decrypt_2x,.-_aes256_decrypt_2x
+
+.align	32
+_aes192_decrypt_1x:
+___
+for ($i=0; $i<5; $i++) {
+    $code.=<<___;
+	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
+	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
+	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
+	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+	aes_dround01	%f56, %f0, %f2, %f4
+	aes_dround23	%f58, %f0, %f2, %f2
+	aes_dround01_l	%f60, %f4, %f2, %f0
+	retl
+	aes_dround23_l	%f62, %f4, %f2, %f2
+.type	_aes192_decrypt_1x,#function
+.size	_aes192_decrypt_1x,.-_aes192_decrypt_1x
+
+.align	32
+_aes192_decrypt_2x:
+___
+for ($i=0; $i<5; $i++) {
+    $code.=<<___;
+	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
+	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
+	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
+	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
+	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
+	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
+	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
+	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+	aes_dround01	%f56, %f0, %f2, %f8
+	aes_dround23	%f58, %f0, %f2, %f2
+	aes_dround01	%f56, %f4, %f6, %f10
+	aes_dround23	%f58, %f4, %f6, %f6
+	aes_dround01_l	%f60, %f8, %f2, %f0
+	aes_dround23_l	%f62, %f8, %f2, %f2
+	aes_dround01_l	%f60, %f10, %f6, %f4
+	retl
+	aes_dround23_l	%f62, %f10, %f6, %f6
+.type	_aes192_decrypt_2x,#function
+.size	_aes192_decrypt_2x,.-_aes192_decrypt_2x
+___
+}}}
+
+if (!$::evp) {
+$code.=<<___;
+.global	AES_encrypt
+AES_encrypt=aes_t4_encrypt
+.global	AES_decrypt
+AES_decrypt=aes_t4_decrypt
+.global	AES_set_encrypt_key
+.align	32
+AES_set_encrypt_key:
+	andcc		%o2, 7, %g0		! check alignment
+	bnz,a,pn	%icc, 1f
+	mov		-1, %o0
+	brz,a,pn	%o0, 1f
+	mov		-1, %o0
+	brz,a,pn	%o2, 1f
+	mov		-1, %o0
+	andncc		%o1, 0x1c0, %g0
+	bnz,a,pn	%icc, 1f
+	mov		-2, %o0
+	cmp		%o1, 128
+	bl,a,pn		%icc, 1f
+	mov		-2, %o0
+	b		aes_t4_set_encrypt_key
+	nop
+1:	retl
+	nop
+.type	AES_set_encrypt_key,#function
+.size	AES_set_encrypt_key,.-AES_set_encrypt_key
+
+.global	AES_set_decrypt_key
+.align	32
+AES_set_decrypt_key:
+	andcc		%o2, 7, %g0		! check alignment
+	bnz,a,pn	%icc, 1f
+	mov		-1, %o0
+	brz,a,pn	%o0, 1f
+	mov		-1, %o0
+	brz,a,pn	%o2, 1f
+	mov		-1, %o0
+	andncc		%o1, 0x1c0, %g0
+	bnz,a,pn	%icc, 1f
+	mov		-2, %o0
+	cmp		%o1, 128
+	bl,a,pn		%icc, 1f
+	mov		-2, %o0
+	b		aes_t4_set_decrypt_key
+	nop
+1:	retl
+	nop
+.type	AES_set_decrypt_key,#function
+.size	AES_set_decrypt_key,.-AES_set_decrypt_key
+___
+
+my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
+
+$code.=<<___;
+.globl	AES_cbc_encrypt
+.align	32
+AES_cbc_encrypt:
+	ld		[$key + 240], %g1
+	nop
+	brz		$enc, .Lcbc_decrypt
+	cmp		%g1, 12
+
+	bl,pt		%icc, aes128_t4_cbc_encrypt
+	nop
+	be,pn		%icc, aes192_t4_cbc_encrypt
+	nop
+	ba		aes256_t4_cbc_encrypt
+	nop
+
+.Lcbc_decrypt:
+	bl,pt		%icc, aes128_t4_cbc_decrypt
+	nop
+	be,pn		%icc, aes192_t4_cbc_decrypt
+	nop
+	ba		aes256_t4_cbc_decrypt
+	nop
+.type	AES_cbc_encrypt,#function
+.size	AES_cbc_encrypt,.-AES_cbc_encrypt
+___
+}
+$code.=<<___;
+.asciz	"AES for SPARC T4, David S. Miller, Andy Polyakov"
+.align	4
+___
+
+&emit_assembler();
+
+close STDOUT;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/dest4-sparcv9.pl	Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,620 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by David S. Miller <[email protected]> and Andy Polyakov
+# <[email protected]>. The module is licensed under 2-clause BSD
+# license. March 2013. All rights reserved.
+# ====================================================================
+
+######################################################################
+# DES for SPARC T4.
+#
+# As with other hardware-assisted ciphers, CBC encrypt results [for
+# aligned data] are virtually identical to critical path lengths:
+#
+#		DES		Triple-DES
+# CBC encrypt	4.14/4.15(*)	11.7/11.7
+# CBC decrypt	1.77/4.11(**)	6.42/7.47
+#
+#			 (*)	numbers after slash are for
+#				misaligned data;
+#			 (**)	this is the result for the largest
+#				block size; unlike in all other
+#				cases, results for smaller blocks
+#				are better[?];
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "sparcv9_modes.pl";
+
+&asm_init(@ARGV);
+
+$code.=<<___ if ($::abibits==64);
+.register       %g2,#scratch
+.register       %g3,#scratch
+___
+
+$code.=<<___;
+
+#include <openssl/fipssyms.h>
+
+.text
+___
+
+{ my ($inp,$out)=("%o0","%o1");
+
+$code.=<<___;
+.align	32
+.globl	des_t4_key_expand
+.type	des_t4_key_expand,#function
+des_t4_key_expand:
+	andcc		$inp, 0x7, %g0
+	alignaddr	$inp, %g0, $inp
+	bz,pt		%icc, 1f
+	ldd		[$inp + 0x00], %f0
+	ldd		[$inp + 0x08], %f2
+	faligndata	%f0, %f2, %f0
+1:	des_kexpand	%f0, 0, %f0
+	des_kexpand	%f0, 1, %f2
+	std		%f0, [$out + 0x00]
+	des_kexpand	%f2, 3, %f6
+	std		%f2, [$out + 0x08]
+	des_kexpand	%f2, 2, %f4
+	des_kexpand	%f6, 3, %f10
+	std		%f6, [$out + 0x18]
+	des_kexpand	%f6, 2, %f8
+	std		%f4, [$out + 0x10]
+	des_kexpand	%f10, 3, %f14
+	std		%f10, [$out + 0x28]
+	des_kexpand	%f10, 2, %f12
+	std		%f8, [$out + 0x20]
+	des_kexpand	%f14, 1, %f16
+	std		%f14, [$out + 0x38]
+	des_kexpand	%f16, 3, %f20
+	std		%f12, [$out + 0x30]
+	des_kexpand	%f16, 2, %f18
+	std		%f16, [$out + 0x40]
+	des_kexpand	%f20, 3, %f24
+	std		%f20, [$out + 0x50]
+	des_kexpand	%f20, 2, %f22
+	std		%f18, [$out + 0x48]
+	des_kexpand	%f24, 3, %f28
+	std		%f24, [$out + 0x60]
+	des_kexpand	%f24, 2, %f26
+	std		%f22, [$out + 0x58]
+	des_kexpand	%f28, 1, %f30
+	std		%f28, [$out + 0x70]
+	std		%f26, [$out + 0x68]
+	retl
+	std		%f30, [$out + 0x78]
+.size	des_t4_key_expand,.-des_t4_key_expand
+___
+}
+{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
+  my ($ileft,$iright,$omask) = map("%g$_",(1..3));
+
+$code.=<<___;
+.globl	des_t4_cbc_encrypt
+.align	32
+des_t4_cbc_encrypt:
+	cmp		$len, 0
+	be,pn		$::size_t_cc, .Lcbc_abort
+	srln		$len, 0, $len		! needed on v8+, "nop" on v9
+	ld		[$ivec + 0], %f0	! load ivec
+	ld		[$ivec + 4], %f1
+
+	and		$inp, 7, $ileft
+	andn		$inp, 7, $inp
+	sll		$ileft, 3, $ileft
+	mov		0xff, $omask
+	prefetch	[$inp], 20
+	prefetch	[$inp + 63], 20
+	sub		%g0, $ileft, $iright
+	and		$out, 7, %g4
+	alignaddrl	$out, %g0, $out
+	srl		$omask, %g4, $omask
+	srlx		$len, 3, $len
+	movrz		%g4, 0, $omask
+	prefetch	[$out], 22
+
+	ldd		[$key + 0x00], %f4	! load key schedule
+	ldd		[$key + 0x08], %f6
+	ldd		[$key + 0x10], %f8
+	ldd		[$key + 0x18], %f10
+	ldd		[$key + 0x20], %f12
+	ldd		[$key + 0x28], %f14
+	ldd		[$key + 0x30], %f16
+	ldd		[$key + 0x38], %f18
+	ldd		[$key + 0x40], %f20
+	ldd		[$key + 0x48], %f22
+	ldd		[$key + 0x50], %f24
+	ldd		[$key + 0x58], %f26
+	ldd		[$key + 0x60], %f28
+	ldd		[$key + 0x68], %f30
+	ldd		[$key + 0x70], %f32
+	ldd		[$key + 0x78], %f34
+
+.Ldes_cbc_enc_loop:
+	ldx		[$inp + 0], %g4
+	brz,pt		$ileft, 4f
+	nop
+
+	ldx		[$inp + 8], %g5
+	sllx		%g4, $ileft, %g4
+	srlx		%g5, $iright, %g5
+	or		%g5, %g4, %g4
+4:
+	movxtod		%g4, %f2
+	prefetch	[$inp + 8+63], 20
+	add		$inp, 8, $inp
+	fxor		%f2, %f0, %f0		! ^= ivec
+	prefetch	[$out + 63], 22
+
+	des_ip		%f0, %f0
+	des_round	%f4, %f6, %f0, %f0
+	des_round	%f8, %f10, %f0, %f0
+	des_round	%f12, %f14, %f0, %f0
+	des_round	%f16, %f18, %f0, %f0
+	des_round	%f20, %f22, %f0, %f0
+	des_round	%f24, %f26, %f0, %f0
+	des_round	%f28, %f30, %f0, %f0
+	des_round	%f32, %f34, %f0, %f0
+	des_iip		%f0, %f0
+
+	brnz,pn		$omask, 2f
+	sub		$len, 1, $len
+
+	std		%f0, [$out + 0]
+	brnz,pt		$len, .Ldes_cbc_enc_loop
+	add		$out, 8, $out
+
+	st		%f0, [$ivec + 0]	! write out ivec
+	retl
+	st		%f1, [$ivec + 4]
+.Lcbc_abort:
+	retl
+	nop
+
+.align	16
+2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
+						! and ~4x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f2		! handle unaligned output
+
+	stda		%f2, [$out + $omask]0xc0	! partial store
+	add		$out, 8, $out
+	orn		%g0, $omask, $omask
+	stda		%f2, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .Ldes_cbc_enc_loop+4
+	orn		%g0, $omask, $omask
+
+	st		%f0, [$ivec + 0]	! write out ivec
+	retl
+	st		%f1, [$ivec + 4]
+.type	des_t4_cbc_encrypt,#function
+.size	des_t4_cbc_encrypt,.-des_t4_cbc_encrypt
+
+.globl	des_t4_cbc_decrypt
+.align	32
+des_t4_cbc_decrypt:
+	cmp		$len, 0
+	be,pn		$::size_t_cc, .Lcbc_abort
+	srln		$len, 0, $len		! needed on v8+, "nop" on v9
+	ld		[$ivec + 0], %f2	! load ivec
+	ld		[$ivec + 4], %f3
+
+	and		$inp, 7, $ileft
+	andn		$inp, 7, $inp
+	sll		$ileft, 3, $ileft
+	mov		0xff, $omask
+	prefetch	[$inp], 20
+	prefetch	[$inp + 63], 20
+	sub		%g0, $ileft, $iright
+	and		$out, 7, %g4
+	alignaddrl	$out, %g0, $out
+	srl		$omask, %g4, $omask
+	srlx		$len, 3, $len
+	movrz		%g4, 0, $omask
+	prefetch	[$out], 22
+
+	ldd		[$key + 0x78], %f4	! load key schedule
+	ldd		[$key + 0x70], %f6
+	ldd		[$key + 0x68], %f8
+	ldd		[$key + 0x60], %f10
+	ldd		[$key + 0x58], %f12
+	ldd		[$key + 0x50], %f14
+	ldd		[$key + 0x48], %f16
+	ldd		[$key + 0x40], %f18
+	ldd		[$key + 0x38], %f20
+	ldd		[$key + 0x30], %f22
+	ldd		[$key + 0x28], %f24
+	ldd		[$key + 0x20], %f26
+	ldd		[$key + 0x18], %f28
+	ldd		[$key + 0x10], %f30
+	ldd		[$key + 0x08], %f32
+	ldd		[$key + 0x00], %f34
+
+.Ldes_cbc_dec_loop:
+	ldx		[$inp + 0], %g4
+	brz,pt		$ileft, 4f
+	nop
+
+	ldx		[$inp + 8], %g5
+	sllx		%g4, $ileft, %g4
+	srlx		%g5, $iright, %g5
+	or		%g5, %g4, %g4
+4:
+	movxtod		%g4, %f0
+	prefetch	[$inp + 8+63], 20
+	add		$inp, 8, $inp
+	prefetch	[$out + 63], 22
+
+	des_ip		%f0, %f0
+	des_round	%f4, %f6, %f0, %f0
+	des_round	%f8, %f10, %f0, %f0
+	des_round	%f12, %f14, %f0, %f0
+	des_round	%f16, %f18, %f0, %f0
+	des_round	%f20, %f22, %f0, %f0
+	des_round	%f24, %f26, %f0, %f0
+	des_round	%f28, %f30, %f0, %f0
+	des_round	%f32, %f34, %f0, %f0
+	des_iip		%f0, %f0
+
+	fxor		%f2, %f0, %f0		! ^= ivec
+	movxtod		%g4, %f2
+
+	brnz,pn		$omask, 2f
+	sub		$len, 1, $len
+
+	std		%f0, [$out + 0]
+	brnz,pt		$len, .Ldes_cbc_dec_loop
+	add		$out, 8, $out
+
+	st		%f2, [$ivec + 0]	! write out ivec
+	retl
+	st		%f3, [$ivec + 4]
+
+.align	16
+2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
+						! and ~4x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f0		! handle unaligned output
+
+	stda		%f0, [$out + $omask]0xc0	! partial store
+	add		$out, 8, $out
+	orn		%g0, $omask, $omask
+	stda		%f0, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .Ldes_cbc_dec_loop+4
+	orn		%g0, $omask, $omask
+
+	st		%f2, [$ivec + 0]	! write out ivec
+	retl
+	st		%f3, [$ivec + 4]
+.type	des_t4_cbc_decrypt,#function
+.size	des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
+___
+
+# One might wonder why there are back-to-back des_iip/des_ip
+# pairs between the EDE passes. Indeed, aren't they inverses of each other?
+# They almost are. The outcome of the pair is the 32-bit words being swapped
+# in the target register. Consider the des_iip/des_ip pair as a way to
+# perform that required swap; it's actually the fastest way in this case.
+
+$code.=<<___;
+.globl	des_t4_ede3_cbc_encrypt
+.align	32
+des_t4_ede3_cbc_encrypt:
+	cmp		$len, 0
+	be,pn		$::size_t_cc, .Lcbc_abort
+	srln		$len, 0, $len		! needed on v8+, "nop" on v9
+	ld		[$ivec + 0], %f0	! load ivec
+	ld		[$ivec + 4], %f1
+
+	and		$inp, 7, $ileft
+	andn		$inp, 7, $inp
+	sll		$ileft, 3, $ileft
+	mov		0xff, $omask
+	prefetch	[$inp], 20
+	prefetch	[$inp + 63], 20
+	sub		%g0, $ileft, $iright
+	and		$out, 7, %g4
+	alignaddrl	$out, %g0, $out
+	srl		$omask, %g4, $omask
+	srlx		$len, 3, $len
+	movrz		%g4, 0, $omask
+	prefetch	[$out], 22
+
+	ldd		[$key + 0x00], %f4	! load key schedule
+	ldd		[$key + 0x08], %f6
+	ldd		[$key + 0x10], %f8
+	ldd		[$key + 0x18], %f10
+	ldd		[$key + 0x20], %f12
+	ldd		[$key + 0x28], %f14
+	ldd		[$key + 0x30], %f16
+	ldd		[$key + 0x38], %f18
+	ldd		[$key + 0x40], %f20
+	ldd		[$key + 0x48], %f22
+	ldd		[$key + 0x50], %f24
+	ldd		[$key + 0x58], %f26
+	ldd		[$key + 0x60], %f28
+	ldd		[$key + 0x68], %f30
+	ldd		[$key + 0x70], %f32
+	ldd		[$key + 0x78], %f34
+
+.Ldes_ede3_cbc_enc_loop:
+	ldx		[$inp + 0], %g4
+	brz,pt		$ileft, 4f
+	nop
+
+	ldx		[$inp + 8], %g5
+	sllx		%g4, $ileft, %g4
+	srlx		%g5, $iright, %g5
+	or		%g5, %g4, %g4
+4:
+	movxtod		%g4, %f2
+	prefetch	[$inp + 8+63], 20
+	add		$inp, 8, $inp
+	fxor		%f2, %f0, %f0		! ^= ivec
+	prefetch	[$out + 63], 22
+
+	des_ip		%f0, %f0
+	des_round	%f4, %f6, %f0, %f0
+	des_round	%f8, %f10, %f0, %f0
+	des_round	%f12, %f14, %f0, %f0
+	des_round	%f16, %f18, %f0, %f0
+	ldd		[$key + 0x100-0x08], %f36
+	ldd		[$key + 0x100-0x10], %f38
+	des_round	%f20, %f22, %f0, %f0
+	ldd		[$key + 0x100-0x18], %f40
+	ldd		[$key + 0x100-0x20], %f42
+	des_round	%f24, %f26, %f0, %f0
+	ldd		[$key + 0x100-0x28], %f44
+	ldd		[$key + 0x100-0x30], %f46
+	des_round	%f28, %f30, %f0, %f0
+	ldd		[$key + 0x100-0x38], %f48
+	ldd		[$key + 0x100-0x40], %f50
+	des_round	%f32, %f34, %f0, %f0
+	ldd		[$key + 0x100-0x48], %f52
+	ldd		[$key + 0x100-0x50], %f54
+	des_iip		%f0, %f0
+
+	ldd		[$key + 0x100-0x58], %f56
+	ldd		[$key + 0x100-0x60], %f58
+	des_ip		%f0, %f0
+	ldd		[$key + 0x100-0x68], %f60
+	ldd		[$key + 0x100-0x70], %f62
+	des_round	%f36, %f38, %f0, %f0
+	ldd		[$key + 0x100-0x78], %f36
+	ldd		[$key + 0x100-0x80], %f38
+	des_round	%f40, %f42, %f0, %f0
+	des_round	%f44, %f46, %f0, %f0
+	des_round	%f48, %f50, %f0, %f0
+	ldd		[$key + 0x100+0x00], %f40
+	ldd		[$key + 0x100+0x08], %f42
+	des_round	%f52, %f54, %f0, %f0
+	ldd		[$key + 0x100+0x10], %f44
+	ldd		[$key + 0x100+0x18], %f46
+	des_round	%f56, %f58, %f0, %f0
+	ldd		[$key + 0x100+0x20], %f48
+	ldd		[$key + 0x100+0x28], %f50
+	des_round	%f60, %f62, %f0, %f0
+	ldd		[$key + 0x100+0x30], %f52
+	ldd		[$key + 0x100+0x38], %f54
+	des_round	%f36, %f38, %f0, %f0
+	ldd		[$key + 0x100+0x40], %f56
+	ldd		[$key + 0x100+0x48], %f58
+	des_iip		%f0, %f0
+
+	ldd		[$key + 0x100+0x50], %f60
+	ldd		[$key + 0x100+0x58], %f62
+	des_ip		%f0, %f0
+	ldd		[$key + 0x100+0x60], %f36
+	ldd		[$key + 0x100+0x68], %f38
+	des_round	%f40, %f42, %f0, %f0
+	ldd		[$key + 0x100+0x70], %f40
+	ldd		[$key + 0x100+0x78], %f42
+	des_round	%f44, %f46, %f0, %f0
+	des_round	%f48, %f50, %f0, %f0
+	des_round	%f52, %f54, %f0, %f0
+	des_round	%f56, %f58, %f0, %f0
+	des_round	%f60, %f62, %f0, %f0
+	des_round	%f36, %f38, %f0, %f0
+	des_round	%f40, %f42, %f0, %f0
+	des_iip		%f0, %f0
+
+	brnz,pn		$omask, 2f
+	sub		$len, 1, $len
+
+	std		%f0, [$out + 0]
+	brnz,pt		$len, .Ldes_ede3_cbc_enc_loop
+	add		$out, 8, $out
+
+	st		%f0, [$ivec + 0]	! write out ivec
+	retl
+	st		%f1, [$ivec + 4]
+
+.align	16
+2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
+						! and ~2x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f2		! handle unaligned output
+
+	stda		%f2, [$out + $omask]0xc0	! partial store
+	add		$out, 8, $out
+	orn		%g0, $omask, $omask
+	stda		%f2, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .Ldes_ede3_cbc_enc_loop+4
+	orn		%g0, $omask, $omask
+
+	st		%f0, [$ivec + 0]	! write out ivec
+	retl
+	st		%f1, [$ivec + 4]
+.type	des_t4_ede3_cbc_encrypt,#function
+.size	des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt
+
+.globl	des_t4_ede3_cbc_decrypt
+.align	32
+des_t4_ede3_cbc_decrypt:
+	cmp		$len, 0
+	be,pn		$::size_t_cc, .Lcbc_abort
+	srln		$len, 0, $len		! needed on v8+, "nop" on v9
+	ld		[$ivec + 0], %f2	! load ivec
+	ld		[$ivec + 4], %f3
+
+	and		$inp, 7, $ileft
+	andn		$inp, 7, $inp
+	sll		$ileft, 3, $ileft
+	mov		0xff, $omask
+	prefetch	[$inp], 20
+	prefetch	[$inp + 63], 20
+	sub		%g0, $ileft, $iright
+	and		$out, 7, %g4
+	alignaddrl	$out, %g0, $out
+	srl		$omask, %g4, $omask
+	srlx		$len, 3, $len
+	movrz		%g4, 0, $omask
+	prefetch	[$out], 22
+
+	ldd		[$key + 0x100+0x78], %f4	! load key schedule
+	ldd		[$key + 0x100+0x70], %f6
+	ldd		[$key + 0x100+0x68], %f8
+	ldd		[$key + 0x100+0x60], %f10
+	ldd		[$key + 0x100+0x58], %f12
+	ldd		[$key + 0x100+0x50], %f14
+	ldd		[$key + 0x100+0x48], %f16
+	ldd		[$key + 0x100+0x40], %f18
+	ldd		[$key + 0x100+0x38], %f20
+	ldd		[$key + 0x100+0x30], %f22
+	ldd		[$key + 0x100+0x28], %f24
+	ldd		[$key + 0x100+0x20], %f26
+	ldd		[$key + 0x100+0x18], %f28
+	ldd		[$key + 0x100+0x10], %f30
+	ldd		[$key + 0x100+0x08], %f32
+	ldd		[$key + 0x100+0x00], %f34
+
+.Ldes_ede3_cbc_dec_loop:
+	ldx		[$inp + 0], %g4
+	brz,pt		$ileft, 4f
+	nop
+
+	ldx		[$inp + 8], %g5
+	sllx		%g4, $ileft, %g4
+	srlx		%g5, $iright, %g5
+	or		%g5, %g4, %g4
+4:
+	movxtod		%g4, %f0
+	prefetch	[$inp + 8+63], 20
+	add		$inp, 8, $inp
+	prefetch	[$out + 63], 22
+
+	des_ip		%f0, %f0
+	des_round	%f4, %f6, %f0, %f0
+	des_round	%f8, %f10, %f0, %f0
+	des_round	%f12, %f14, %f0, %f0
+	des_round	%f16, %f18, %f0, %f0
+	ldd		[$key + 0x80+0x00], %f36
+	ldd		[$key + 0x80+0x08], %f38
+	des_round	%f20, %f22, %f0, %f0
+	ldd		[$key + 0x80+0x10], %f40
+	ldd		[$key + 0x80+0x18], %f42
+	des_round	%f24, %f26, %f0, %f0
+	ldd		[$key + 0x80+0x20], %f44
+	ldd		[$key + 0x80+0x28], %f46
+	des_round	%f28, %f30, %f0, %f0
+	ldd		[$key + 0x80+0x30], %f48
+	ldd		[$key + 0x80+0x38], %f50
+	des_round	%f32, %f34, %f0, %f0
+	ldd		[$key + 0x80+0x40], %f52
+	ldd		[$key + 0x80+0x48], %f54
+	des_iip		%f0, %f0
+
+	ldd		[$key + 0x80+0x50], %f56
+	ldd		[$key + 0x80+0x58], %f58
+	des_ip		%f0, %f0
+	ldd		[$key + 0x80+0x60], %f60
+	ldd		[$key + 0x80+0x68], %f62
+	des_round	%f36, %f38, %f0, %f0
+	ldd		[$key + 0x80+0x70], %f36
+	ldd		[$key + 0x80+0x78], %f38
+	des_round	%f40, %f42, %f0, %f0
+	des_round	%f44, %f46, %f0, %f0
+	des_round	%f48, %f50, %f0, %f0
+	ldd		[$key + 0x80-0x08], %f40
+	ldd		[$key + 0x80-0x10], %f42
+	des_round	%f52, %f54, %f0, %f0
+	ldd		[$key + 0x80-0x18], %f44
+	ldd		[$key + 0x80-0x20], %f46
+	des_round	%f56, %f58, %f0, %f0
+	ldd		[$key + 0x80-0x28], %f48
+	ldd		[$key + 0x80-0x30], %f50
+	des_round	%f60, %f62, %f0, %f0
+	ldd		[$key + 0x80-0x38], %f52
+	ldd		[$key + 0x80-0x40], %f54
+	des_round	%f36, %f38, %f0, %f0
+	ldd		[$key + 0x80-0x48], %f56
+	ldd		[$key + 0x80-0x50], %f58
+	des_iip		%f0, %f0
+
+	ldd		[$key + 0x80-0x58], %f60
+	ldd		[$key + 0x80-0x60], %f62
+	des_ip		%f0, %f0
+	ldd		[$key + 0x80-0x68], %f36
+	ldd		[$key + 0x80-0x70], %f38
+	des_round	%f40, %f42, %f0, %f0
+	ldd		[$key + 0x80-0x78], %f40
+	ldd		[$key + 0x80-0x80], %f42
+	des_round	%f44, %f46, %f0, %f0
+	des_round	%f48, %f50, %f0, %f0
+	des_round	%f52, %f54, %f0, %f0
+	des_round	%f56, %f58, %f0, %f0
+	des_round	%f60, %f62, %f0, %f0
+	des_round	%f36, %f38, %f0, %f0
+	des_round	%f40, %f42, %f0, %f0
+	des_iip		%f0, %f0
+
+	fxor		%f2, %f0, %f0		! ^= ivec
+	movxtod		%g4, %f2
+
+	brnz,pn		$omask, 2f
+	sub		$len, 1, $len
+
+	std		%f0, [$out + 0]
+	brnz,pt		$len, .Ldes_ede3_cbc_dec_loop
+	add		$out, 8, $out
+
+	st		%f2, [$ivec + 0]	! write out ivec
+	retl
+	st		%f3, [$ivec + 4]
+
+.align	16
+2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
+						! and ~3x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f0		! handle unaligned output
+
+	stda		%f0, [$out + $omask]0xc0	! partial store
+	add		$out, 8, $out
+	orn		%g0, $omask, $omask
+	stda		%f0, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .Ldes_ede3_cbc_dec_loop+4
+	orn		%g0, $omask, $omask
+
+	st		%f2, [$ivec + 0]	! write out ivec
+	retl
+	st		%f3, [$ivec + 4]
+.type	des_t4_ede3_cbc_decrypt,#function
+.size	des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
+___
+}
+$code.=<<___;
+.asciz  "DES for SPARC T4, David S. Miller, Andy Polyakov"
+.align  4
+___
+
+&emit_assembler();
+
+close STDOUT;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/e_aes.c	Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,1922 @@
+/* ====================================================================
+ * Copyright (c) 2001-2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ */
+
+#include <openssl/opensslconf.h>
+#ifndef OPENSSL_NO_AES
+#include <openssl/crypto.h>
+# include <openssl/evp.h>
+# include <openssl/err.h>
+# include <string.h>
+# include <assert.h>
+# include <openssl/aes.h>
+# include "evp_locl.h"
+# include "modes_lcl.h"
+# include <openssl/rand.h>
+
+typedef struct {
+    union {
+        double align;
+        AES_KEY ks;
+    } ks;
+    block128_f block;
+    union {
+        cbc128_f cbc;
+        ctr128_f ctr;
+    } stream;
+} EVP_AES_KEY;
+
+typedef struct {
+    union {
+        double align;
+        AES_KEY ks;
+    } ks;                       /* AES key schedule to use */
+    int key_set;                /* Set if key initialised */
+    int iv_set;                 /* Set if an iv is set */
+    GCM128_CONTEXT gcm;
+    unsigned char *iv;          /* Temporary IV store */
+    int ivlen;                  /* IV length */
+    int taglen;
+    int iv_gen;                 /* It is OK to generate IVs */
+    int tls_aad_len;            /* TLS AAD length */
+    ctr128_f ctr;
+} EVP_AES_GCM_CTX;
+
+typedef struct {
+    union {
+        double align;
+        AES_KEY ks;
+    } ks1, ks2;                 /* AES key schedules to use */
+    XTS128_CONTEXT xts;
+    void (*stream) (const unsigned char *in,
+                    unsigned char *out, size_t length,
+                    const AES_KEY *key1, const AES_KEY *key2,
+                    const unsigned char iv[16]);
+} EVP_AES_XTS_CTX;
+
+typedef struct {
+    union {
+        double align;
+        AES_KEY ks;
+    } ks;                       /* AES key schedule to use */
+    int key_set;                /* Set if key initialised */
+    int iv_set;                 /* Set if an iv is set */
+    int tag_set;                /* Set if tag is valid */
+    int len_set;                /* Set if message length set */
+    int L, M;                   /* L and M parameters from RFC3610 */
+    CCM128_CONTEXT ccm;
+    ccm128_f str;
+} EVP_AES_CCM_CTX;
+
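+/*
+ * Note on intent (inferred from how the macro is used in the CFB1 code
+ * path later in this file, not an upstream comment): the largest chunk,
+ * in bytes, processed per call there, chosen so that a byte count can be
+ * converted to a bit count without overflowing size_t.
+ */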
+# define MAXBITCHUNK     ((size_t)1<<(sizeof(size_t)*8-4))
+
+# ifdef VPAES_ASM
+int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
+                          AES_KEY *key);
+int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
+                          AES_KEY *key);
+
+void vpaes_encrypt(const unsigned char *in, unsigned char *out,
+                   const AES_KEY *key);
+void vpaes_decrypt(const unsigned char *in, unsigned char *out,
+                   const AES_KEY *key);
+
+void vpaes_cbc_encrypt(const unsigned char *in,
+                       unsigned char *out,
+                       size_t length,
+                       const AES_KEY *key, unsigned char *ivec, int enc);
+# endif
+# ifdef BSAES_ASM
+void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                       size_t length, const AES_KEY *key,
+                       unsigned char ivec[16], int enc);
+void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+                                size_t len, const AES_KEY *key,
+                                const unsigned char ivec[16]);
+void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
+                       size_t len, const AES_KEY *key1,
+                       const AES_KEY *key2, const unsigned char iv[16]);
+void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
+                       size_t len, const AES_KEY *key1,
+                       const AES_KEY *key2, const unsigned char iv[16]);
+# endif
+# ifdef AES_CTR_ASM
+void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+                       size_t blocks, const AES_KEY *key,
+                       const unsigned char ivec[AES_BLOCK_SIZE]);
+# endif
+# ifdef AES_XTS_ASM
+void AES_xts_encrypt(const char *inp, char *out, size_t len,
+                     const AES_KEY *key1, const AES_KEY *key2,
+                     const unsigned char iv[16]);
+void AES_xts_decrypt(const char *inp, char *out, size_t len,
+                     const AES_KEY *key1, const AES_KEY *key2,
+                     const unsigned char iv[16]);
+# endif
+
+# if     defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
+#  include "ppc_arch.h"
+#  ifdef VPAES_ASM
+#   define VPAES_CAPABLE (OPENSSL_ppccap_P & PPC_ALTIVEC)
+#  endif
+#  define HWAES_CAPABLE  (OPENSSL_ppccap_P & PPC_CRYPTO207)
+#  define HWAES_set_encrypt_key aes_p8_set_encrypt_key
+#  define HWAES_set_decrypt_key aes_p8_set_decrypt_key
+#  define HWAES_encrypt aes_p8_encrypt
+#  define HWAES_decrypt aes_p8_decrypt
+#  define HWAES_cbc_encrypt aes_p8_cbc_encrypt
+#  define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks
+# endif
+
+# if     defined(AES_ASM) && !defined(I386_ONLY) &&      (  \
+        ((defined(__i386)       || defined(__i386__)    || \
+          defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
+        defined(__x86_64)       || defined(__x86_64__)  || \
+        defined(_M_AMD64)       || defined(_M_X64)      || \
+        defined(__INTEL__)                              )
+
+extern unsigned int OPENSSL_ia32cap_P[];
+
+#  ifdef VPAES_ASM
+#   define VPAES_CAPABLE   (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
+#  endif
+#  ifdef BSAES_ASM
+#   define BSAES_CAPABLE   (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
+#  endif
+/*
+ * AES-NI section
+ */
+#  define AESNI_CAPABLE   (OPENSSL_ia32cap_P[1]&(1<<(57-32)))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+                          AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+                          AES_KEY *key);
+
+void aesni_encrypt(const unsigned char *in, unsigned char *out,
+                   const AES_KEY *key);
+void aesni_decrypt(const unsigned char *in, unsigned char *out,
+                   const AES_KEY *key);
+
+void aesni_ecb_encrypt(const unsigned char *in,
+                       unsigned char *out,
+                       size_t length, const AES_KEY *key, int enc);
+void aesni_cbc_encrypt(const unsigned char *in,
+                       unsigned char *out,
+                       size_t length,
+                       const AES_KEY *key, unsigned char *ivec, int enc);
+
+void aesni_ctr32_encrypt_blocks(const unsigned char *in,
+                                unsigned char *out,
+                                size_t blocks,
+                                const void *key, const unsigned char *ivec);
+
+void aesni_xts_encrypt(const unsigned char *in,
+                       unsigned char *out,
+                       size_t length,
+                       const AES_KEY *key1, const AES_KEY *key2,
+                       const unsigned char iv[16]);
+
+void aesni_xts_decrypt(const unsigned char *in,
+                       unsigned char *out,
+                       size_t length,
+                       const AES_KEY *key1, const AES_KEY *key2,
+                       const unsigned char iv[16]);
+
+void aesni_ccm64_encrypt_blocks(const unsigned char *in,
+                                unsigned char *out,
+                                size_t blocks,
+                                const void *key,
+                                const unsigned char ivec[16],
+                                unsigned char cmac[16]);
+
+void aesni_ccm64_decrypt_blocks(const unsigned char *in,
+                                unsigned char *out,
+                                size_t blocks,
+                                const void *key,
+                                const unsigned char ivec[16],
+                                unsigned char cmac[16]);
+
+#  if defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
+size_t aesni_gcm_encrypt(const unsigned char *in,
+                         unsigned char *out,
+                         size_t len,
+                         const void *key, unsigned char ivec[16], u64 *Xi);
+#   define AES_gcm_encrypt aesni_gcm_encrypt
+size_t aesni_gcm_decrypt(const unsigned char *in,
+                         unsigned char *out,
+                         size_t len,
+                         const void *key, unsigned char ivec[16], u64 *Xi);
+#   define AES_gcm_decrypt aesni_gcm_decrypt
+void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *in,
+                   size_t len);
+#   define AES_GCM_ASM(gctx)       (gctx->ctr==aesni_ctr32_encrypt_blocks && \
+                                 gctx->gcm.ghash==gcm_ghash_avx)
+#   define AES_GCM_ASM2(gctx)      (gctx->gcm.block==(block128_f)aesni_encrypt && \
+                                 gctx->gcm.ghash==gcm_ghash_avx)
+#   undef AES_GCM_ASM2          /* minor size optimization */
+#  endif
+
+static int aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                          const unsigned char *iv, int enc)
+{
+    int ret, mode;
+    EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+
+    mode = ctx->cipher->flags & EVP_CIPH_MODE;
+    if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+        && !enc) {
+        ret = aesni_set_decrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+        dat->block = (block128_f) aesni_decrypt;
+        dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+            (cbc128_f) aesni_cbc_encrypt : NULL;
+    } else {
+        ret = aesni_set_encrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+        dat->block = (block128_f) aesni_encrypt;
+        if (mode == EVP_CIPH_CBC_MODE)
+            dat->stream.cbc = (cbc128_f) aesni_cbc_encrypt;
+        else if (mode == EVP_CIPH_CTR_MODE)
+            dat->stream.ctr = (ctr128_f) aesni_ctr32_encrypt_blocks;
+        else
+            dat->stream.cbc = NULL;
+    }
+
+    if (ret < 0) {
+        EVPerr(EVP_F_AESNI_INIT_KEY, EVP_R_AES_KEY_SETUP_FAILED);
+        return 0;
+    }
+
+    return 1;
+}
+
+static int aesni_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                            const unsigned char *in, size_t len)
+{
+    aesni_cbc_encrypt(in, out, len, ctx->cipher_data, ctx->iv, ctx->encrypt);
+
+    return 1;
+}
+
+static int aesni_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                            const unsigned char *in, size_t len)
+{
+    size_t bl = ctx->cipher->block_size;
+
+    if (len < bl)
+        return 1;
+
+    aesni_ecb_encrypt(in, out, len, ctx->cipher_data, ctx->encrypt);
+
+    return 1;
+}
+
+#  define aesni_ofb_cipher aes_ofb_cipher
+static int aesni_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                            const unsigned char *in, size_t len);
+
+#  define aesni_cfb_cipher aes_cfb_cipher
+static int aesni_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                            const unsigned char *in, size_t len);
+
+#  define aesni_cfb8_cipher aes_cfb8_cipher
+static int aesni_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                             const unsigned char *in, size_t len);
+
+#  define aesni_cfb1_cipher aes_cfb1_cipher
+static int aesni_cfb1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                             const unsigned char *in, size_t len);
+
+#  define aesni_ctr_cipher aes_ctr_cipher
+static int aesni_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                            const unsigned char *in, size_t len);
+
+static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                              const unsigned char *iv, int enc)
+{
+    EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+    if (!iv && !key)
+        return 1;
+    if (key) {
+        aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
+        CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f) aesni_encrypt);
+        gctx->ctr = (ctr128_f) aesni_ctr32_encrypt_blocks;
+        /*
+         * If we have an IV, set it directly; otherwise use the saved IV.
+         */
+        if (iv == NULL && gctx->iv_set)
+            iv = gctx->iv;
+        if (iv) {
+            CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+            gctx->iv_set = 1;
+        }
+        gctx->key_set = 1;
+    } else {
+        /* If the key is set, use the IV now; otherwise save a copy of it */
+        if (gctx->key_set)
+            CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+        else
+            memcpy(gctx->iv, iv, gctx->ivlen);
+        gctx->iv_set = 1;
+        gctx->iv_gen = 0;
+    }
+    return 1;
+}
+
+#  define aesni_gcm_cipher aes_gcm_cipher
+static int aesni_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                            const unsigned char *in, size_t len);
+
+static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                              const unsigned char *iv, int enc)
+{
+    EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+    if (!iv && !key)
+        return 1;
+
+    if (key) {
+        /* key_len is two AES keys */
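+        /* so each half is key_len/2 bytes, i.e. key_len * 4 bits */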
+        if (enc) {
+            aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+            xctx->xts.block1 = (block128_f) aesni_encrypt;
+            xctx->stream = aesni_xts_encrypt;
+        } else {
+            aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+            xctx->xts.block1 = (block128_f) aesni_decrypt;
+            xctx->stream = aesni_xts_decrypt;
+        }
+
+        aesni_set_encrypt_key(key + ctx->key_len / 2,
+                              ctx->key_len * 4, &xctx->ks2.ks);
+        xctx->xts.block2 = (block128_f) aesni_encrypt;
+
+        xctx->xts.key1 = &xctx->ks1;
+    }
+
+    if (iv) {
+        xctx->xts.key2 = &xctx->ks2;
+        memcpy(ctx->iv, iv, 16);
+    }
+
+    return 1;
+}
+
+#  define aesni_xts_cipher aes_xts_cipher
+static int aesni_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                            const unsigned char *in, size_t len);
+
+static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                              const unsigned char *iv, int enc)
+{
+    EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+    if (!iv && !key)
+        return 1;
+    if (key) {
+        aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
+        CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+                           &cctx->ks, (block128_f) aesni_encrypt);
+        cctx->str = enc ? (ccm128_f) aesni_ccm64_encrypt_blocks :
+            (ccm128_f) aesni_ccm64_decrypt_blocks;
+        cctx->key_set = 1;
+    }
+    if (iv) {
+        memcpy(ctx->iv, iv, 15 - cctx->L);
+        cctx->iv_set = 1;
+    }
+    return 1;
+}
+
+#  define aesni_ccm_cipher aes_ccm_cipher
+static int aesni_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                            const unsigned char *in, size_t len);
+
+#  define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+        nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+        flags|EVP_CIPH_##MODE##_MODE,   \
+        aesni_init_key,                 \
+        aesni_##mode##_cipher,          \
+        NULL,                           \
+        sizeof(EVP_AES_KEY),            \
+        NULL,NULL,NULL,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+        nid##_##keylen##_##nmode,blocksize,     \
+        keylen/8,ivlen, \
+        flags|EVP_CIPH_##MODE##_MODE,   \
+        aes_init_key,                   \
+        aes_##mode##_cipher,            \
+        NULL,                           \
+        sizeof(EVP_AES_KEY),            \
+        NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+#  define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+        nid##_##keylen##_##mode,blocksize, \
+        (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+        flags|EVP_CIPH_##MODE##_MODE,   \
+        aesni_##mode##_init_key,        \
+        aesni_##mode##_cipher,          \
+        aes_##mode##_cleanup,           \
+        sizeof(EVP_AES_##MODE##_CTX),   \
+        NULL,NULL,aes_##mode##_ctrl,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+        nid##_##keylen##_##mode,blocksize, \
+        (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+        flags|EVP_CIPH_##MODE##_MODE,   \
+        aes_##mode##_init_key,          \
+        aes_##mode##_cipher,            \
+        aes_##mode##_cleanup,           \
+        sizeof(EVP_AES_##MODE##_CTX),   \
+        NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+# elif   defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
+
+#  include "sparc_arch.h"
+
+extern unsigned int OPENSSL_sparcv9cap_P[];
+
+#  define SPARC_AES_CAPABLE       (OPENSSL_sparcv9cap_P[1] & CFR_AES)
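+/* OPENSSL_sparcv9cap_P[1] carries the T4 Compatibility Feature Register bits */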
+
+void aes_t4_set_encrypt_key(const unsigned char *key, int bits, AES_KEY *ks);
+void aes_t4_set_decrypt_key(const unsigned char *key, int bits, AES_KEY *ks);
+void aes_t4_encrypt(const unsigned char *in, unsigned char *out,
+                    const AES_KEY *key);
+void aes_t4_decrypt(const unsigned char *in, unsigned char *out,
+                    const AES_KEY *key);
+/*
+ * Key-length-specific subroutines were chosen for the following reason.
+ * Each SPARC T4 core can execute up to 8 threads, which share the core's
+ * resources. Loading as much key material as possible into registers
+ * minimizes references to the shared memory interface, as well as the
+ * number of instructions in the inner loops [much needed on T4]. Having
+ * non-key-length-specific routines instead would require conditional
+ * branches either in the inner loops or on subroutine entry. The former
+ * is hardly acceptable, while the latter would grow the code to roughly
+ * the size occupied by multiple key-length-specific subroutines anyway,
+ * so why fight?
+ */
+void aes128_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                           size_t len, const AES_KEY *key,
+                           unsigned char *ivec);
+void aes128_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
+                           size_t len, const AES_KEY *key,
+                           unsigned char *ivec);
+void aes192_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                           size_t len, const AES_KEY *key,
+                           unsigned char *ivec);
+void aes192_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
+                           size_t len, const AES_KEY *key,
+                           unsigned char *ivec);
+void aes256_t4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                           size_t len, const AES_KEY *key,
+                           unsigned char *ivec);
+void aes256_t4_cbc_decrypt(const unsigned char *in, unsigned char *out,
+                           size_t len, const AES_KEY *key,
+                           unsigned char *ivec);
+void aes128_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+                             size_t blocks, const AES_KEY *key,
+                             unsigned char *ivec);
+void aes192_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+                             size_t blocks, const AES_KEY *key,
+                             unsigned char *ivec);
+void aes256_t4_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+                             size_t blocks, const AES_KEY *key,
+                             unsigned char *ivec);
+void aes128_t4_xts_encrypt(const unsigned char *in, unsigned char *out,
+                           size_t blocks, const AES_KEY *key1,
+                           const AES_KEY *key2, const unsigned char *ivec);
+void aes128_t4_xts_decrypt(const unsigned char *in, unsigned char *out,
+                           size_t blocks, const AES_KEY *key1,
+                           const AES_KEY *key2, const unsigned char *ivec);
+void aes256_t4_xts_encrypt(const unsigned char *in, unsigned char *out,
+                           size_t blocks, const AES_KEY *key1,
+                           const AES_KEY *key2, const unsigned char *ivec);
+void aes256_t4_xts_decrypt(const unsigned char *in, unsigned char *out,
+                           size_t blocks, const AES_KEY *key1,
+                           const AES_KEY *key2, const unsigned char *ivec);
+
+static int aes_t4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                           const unsigned char *iv, int enc)
+{
+    int ret, mode, bits;
+    EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+
+    mode = ctx->cipher->flags & EVP_CIPH_MODE;
+    bits = ctx->key_len * 8;
+    if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+        && !enc) {
+        ret = 0;
+        aes_t4_set_decrypt_key(key, bits, ctx->cipher_data);
+        dat->block = (block128_f) aes_t4_decrypt;
+        switch (bits) {
+        case 128:
+            dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+                (cbc128_f) aes128_t4_cbc_decrypt : NULL;
+            break;
+        case 192:
+            dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+                (cbc128_f) aes192_t4_cbc_decrypt : NULL;
+            break;
+        case 256:
+            dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+                (cbc128_f) aes256_t4_cbc_decrypt : NULL;
+            break;
+        default:
+            ret = -1;
+        }
+    } else {
+        ret = 0;
+        aes_t4_set_encrypt_key(key, bits, ctx->cipher_data);
+        dat->block = (block128_f) aes_t4_encrypt;
+        switch (bits) {
+        case 128:
+            if (mode == EVP_CIPH_CBC_MODE)
+                dat->stream.cbc = (cbc128_f) aes128_t4_cbc_encrypt;
+            else if (mode == EVP_CIPH_CTR_MODE)
+                dat->stream.ctr = (ctr128_f) aes128_t4_ctr32_encrypt;
+            else
+                dat->stream.cbc = NULL;
+            break;
+        case 192:
+            if (mode == EVP_CIPH_CBC_MODE)
+                dat->stream.cbc = (cbc128_f) aes192_t4_cbc_encrypt;
+            else if (mode == EVP_CIPH_CTR_MODE)
+                dat->stream.ctr = (ctr128_f) aes192_t4_ctr32_encrypt;
+            else
+                dat->stream.cbc = NULL;
+            break;
+        case 256:
+            if (mode == EVP_CIPH_CBC_MODE)
+                dat->stream.cbc = (cbc128_f) aes256_t4_cbc_encrypt;
+            else if (mode == EVP_CIPH_CTR_MODE)
+                dat->stream.ctr = (ctr128_f) aes256_t4_ctr32_encrypt;
+            else
+                dat->stream.cbc = NULL;
+            break;
+        default:
+            ret = -1;
+        }
+    }
+
+    if (ret < 0) {
+        EVPerr(EVP_F_AES_T4_INIT_KEY, EVP_R_AES_KEY_SETUP_FAILED);
+        return 0;
+    }
+
+    return 1;
+}
+
+#  define aes_t4_cbc_cipher aes_cbc_cipher
+static int aes_t4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                             const unsigned char *in, size_t len);
+
+#  define aes_t4_ecb_cipher aes_ecb_cipher
+static int aes_t4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                             const unsigned char *in, size_t len);
+
+#  define aes_t4_ofb_cipher aes_ofb_cipher
+static int aes_t4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                             const unsigned char *in, size_t len);
+
+#  define aes_t4_cfb_cipher aes_cfb_cipher
+static int aes_t4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                             const unsigned char *in, size_t len);
+
+#  define aes_t4_cfb8_cipher aes_cfb8_cipher
+static int aes_t4_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                              const unsigned char *in, size_t len);
+
+#  define aes_t4_cfb1_cipher aes_cfb1_cipher
+static int aes_t4_cfb1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                              const unsigned char *in, size_t len);
+
+#  define aes_t4_ctr_cipher aes_ctr_cipher
+static int aes_t4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                             const unsigned char *in, size_t len);
+
+static int aes_t4_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                               const unsigned char *iv, int enc)
+{
+    EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+    if (!iv && !key)
+        return 1;
+    if (key) {
+        int bits = ctx->key_len * 8;
+        aes_t4_set_encrypt_key(key, bits, &gctx->ks.ks);
+        CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+                           (block128_f) aes_t4_encrypt);
+        switch (bits) {
+        case 128:
+            gctx->ctr = (ctr128_f) aes128_t4_ctr32_encrypt;
+            break;
+        case 192:
+            gctx->ctr = (ctr128_f) aes192_t4_ctr32_encrypt;
+            break;
+        case 256:
+            gctx->ctr = (ctr128_f) aes256_t4_ctr32_encrypt;
+            break;
+        default:
+            return 0;
+        }
+        /*
+         * If we have an IV, set it directly; otherwise use the saved IV.
+         */
+        if (iv == NULL && gctx->iv_set)
+            iv = gctx->iv;
+        if (iv) {
+            CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+            gctx->iv_set = 1;
+        }
+        gctx->key_set = 1;
+    } else {
+        /* If the key is set, use the IV now; otherwise save a copy of it */
+        if (gctx->key_set)
+            CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+        else
+            memcpy(gctx->iv, iv, gctx->ivlen);
+        gctx->iv_set = 1;
+        gctx->iv_gen = 0;
+    }
+    return 1;
+}
+
+#  define aes_t4_gcm_cipher aes_gcm_cipher
+static int aes_t4_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                             const unsigned char *in, size_t len);
+
+static int aes_t4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                               const unsigned char *iv, int enc)
+{
+    EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+    if (!iv && !key)
+        return 1;
+
+    if (key) {
+        int bits = ctx->key_len * 4;
+        xctx->stream = NULL;
+        /* key_len is two AES keys */
+        if (enc) {
+            aes_t4_set_encrypt_key(key, bits, &xctx->ks1.ks);
+            xctx->xts.block1 = (block128_f) aes_t4_encrypt;
+            switch (bits) {
+            case 128:
+                xctx->stream = aes128_t4_xts_encrypt;
+                break;
+#  if 0                         /* not yet */
+            case 192:
+                xctx->stream = aes192_t4_xts_encrypt;
+                break;
+#  endif
+            case 256:
+                xctx->stream = aes256_t4_xts_encrypt;
+                break;
+            default:
+                return 0;
+            }
+        } else {
+            aes_t4_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+            xctx->xts.block1 = (block128_f) aes_t4_decrypt;
+            switch (bits) {
+            case 128:
+                xctx->stream = aes128_t4_xts_decrypt;
+                break;
+#  if 0                         /* not yet */
+            case 192:
+                xctx->stream = aes192_t4_xts_decrypt;
+                break;
+#  endif
+            case 256:
+                xctx->stream = aes256_t4_xts_decrypt;
+                break;
+            default:
+                return 0;
+            }
+        }
+
+        aes_t4_set_encrypt_key(key + ctx->key_len / 2,
+                               ctx->key_len * 4, &xctx->ks2.ks);
+        xctx->xts.block2 = (block128_f) aes_t4_encrypt;
+
+        xctx->xts.key1 = &xctx->ks1;
+    }
+
+    if (iv) {
+        xctx->xts.key2 = &xctx->ks2;
+        memcpy(ctx->iv, iv, 16);
+    }
+
+    return 1;
+}
+
+#  define aes_t4_xts_cipher aes_xts_cipher
+static int aes_t4_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                             const unsigned char *in, size_t len);
+
+static int aes_t4_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                               const unsigned char *iv, int enc)
+{
+    EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+    if (!iv && !key)
+        return 1;
+    if (key) {
+        int bits = ctx->key_len * 8;
+        aes_t4_set_encrypt_key(key, bits, &cctx->ks.ks);
+        CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+                           &cctx->ks, (block128_f) aes_t4_encrypt);
+#  if 0                         /* not yet */
+        switch (bits) {
+        case 128:
+            cctx->str = enc ? (ccm128_f) aes128_t4_ccm64_encrypt :
+                (ccm128_f) aes128_t4_ccm64_decrypt;
+            break;
+        case 192:
+            cctx->str = enc ? (ccm128_f) aes192_t4_ccm64_encrypt :
+                (ccm128_f) aes192_t4_ccm64_decrypt;
+            break;
+        case 256:
+            cctx->str = enc ? (ccm128_f) aes256_t4_ccm64_encrypt :
+                (ccm128_f) aes256_t4_ccm64_decrypt;
+            break;
+        default:
+            return 0;
+        }
+#  else
+        cctx->str = NULL;
+#  endif
+        cctx->key_set = 1;
+    }
+    if (iv) {
+        memcpy(ctx->iv, iv, 15 - cctx->L);
+        cctx->iv_set = 1;
+    }
+    return 1;
+}
+
+#  define aes_t4_ccm_cipher aes_ccm_cipher
+static int aes_t4_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                             const unsigned char *in, size_t len);
+
+#  define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aes_t4_##keylen##_##mode = { \
+        nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+        flags|EVP_CIPH_##MODE##_MODE,   \
+        aes_t4_init_key,                \
+        aes_t4_##mode##_cipher,         \
+        NULL,                           \
+        sizeof(EVP_AES_KEY),            \
+        NULL,NULL,NULL,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+        nid##_##keylen##_##nmode,blocksize,     \
+        keylen/8,ivlen, \
+        flags|EVP_CIPH_##MODE##_MODE,   \
+        aes_init_key,                   \
+        aes_##mode##_cipher,            \
+        NULL,                           \
+        sizeof(EVP_AES_KEY),            \
+        NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return SPARC_AES_CAPABLE?&aes_t4_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+#  define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aes_t4_##keylen##_##mode = { \
+        nid##_##keylen##_##mode,blocksize, \
+        (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+        flags|EVP_CIPH_##MODE##_MODE,   \
+        aes_t4_##mode##_init_key,       \
+        aes_t4_##mode##_cipher,         \
+        aes_##mode##_cleanup,           \
+        sizeof(EVP_AES_##MODE##_CTX),   \
+        NULL,NULL,aes_##mode##_ctrl,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+        nid##_##keylen##_##mode,blocksize, \
+        (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+        flags|EVP_CIPH_##MODE##_MODE,   \
+        aes_##mode##_init_key,          \
+        aes_##mode##_cipher,            \
+        aes_##mode##_cleanup,           \
+        sizeof(EVP_AES_##MODE##_CTX),   \
+        NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return SPARC_AES_CAPABLE?&aes_t4_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+# else
+
+#  define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+        nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+        flags|EVP_CIPH_##MODE##_MODE,   \
+        aes_init_key,                   \
+        aes_##mode##_cipher,            \
+        NULL,                           \
+        sizeof(EVP_AES_KEY),            \
+        NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+
+#  define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+        nid##_##keylen##_##mode,blocksize, \
+        (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+        flags|EVP_CIPH_##MODE##_MODE,   \
+        aes_##mode##_init_key,          \
+        aes_##mode##_cipher,            \
+        aes_##mode##_cleanup,           \
+        sizeof(EVP_AES_##MODE##_CTX),   \
+        NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+# endif
+
+# if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__))
+#  include "arm_arch.h"
+#  if __ARM_MAX_ARCH__>=7
+#   if defined(BSAES_ASM)
+#    define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
+#   endif
+#   define HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES)
+#   define HWAES_set_encrypt_key aes_v8_set_encrypt_key
+#   define HWAES_set_decrypt_key aes_v8_set_decrypt_key
+#   define HWAES_encrypt aes_v8_encrypt
+#   define HWAES_decrypt aes_v8_decrypt
+#   define HWAES_cbc_encrypt aes_v8_cbc_encrypt
+#   define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
+#  endif
+# endif
+
+# if defined(HWAES_CAPABLE)
+int HWAES_set_encrypt_key(const unsigned char *userKey, const int bits,
+                          AES_KEY *key);
+int HWAES_set_decrypt_key(const unsigned char *userKey, const int bits,
+                          AES_KEY *key);
+void HWAES_encrypt(const unsigned char *in, unsigned char *out,
+                   const AES_KEY *key);
+void HWAES_decrypt(const unsigned char *in, unsigned char *out,
+                   const AES_KEY *key);
+void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                       size_t length, const AES_KEY *key,
+                       unsigned char *ivec, const int enc);
+void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+                                size_t len, const AES_KEY *key,
+                                const unsigned char ivec[16]);
+# endif
+
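+/*
+ * BLOCK_CIPHER_generic_pack() instantiates the CBC, ECB, OFB, CFB128,
+ * CFB1, CFB8 and CTR EVP_CIPHERs for a single key length.
+ */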
+# define BLOCK_CIPHER_generic_pack(nid,keylen,flags)             \
+        BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)     \
+        BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)      \
+        BLOCK_CIPHER_generic(nid,keylen,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)   \
+        BLOCK_CIPHER_generic(nid,keylen,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)   \
+        BLOCK_CIPHER_generic(nid,keylen,1,16,cfb1,cfb1,CFB,flags)       \
+        BLOCK_CIPHER_generic(nid,keylen,1,16,cfb8,cfb8,CFB,flags)       \
+        BLOCK_CIPHER_generic(nid,keylen,1,16,ctr,ctr,CTR,flags)
+
+static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                        const unsigned char *iv, int enc)
+{
+    int ret, mode;
+    EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+
+    mode = ctx->cipher->flags & EVP_CIPH_MODE;
+    if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+        && !enc)
+# ifdef HWAES_CAPABLE
+        if (HWAES_CAPABLE) {
+            ret = HWAES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+            dat->block = (block128_f) HWAES_decrypt;
+            dat->stream.cbc = NULL;
+#  ifdef HWAES_cbc_encrypt
+            if (mode == EVP_CIPH_CBC_MODE)
+                dat->stream.cbc = (cbc128_f) HWAES_cbc_encrypt;
+#  endif
+        } else
+# endif
+# ifdef BSAES_CAPABLE
+        if (BSAES_CAPABLE && mode == EVP_CIPH_CBC_MODE) {
+            ret = AES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+            dat->block = (block128_f) AES_decrypt;
+            dat->stream.cbc = (cbc128_f) bsaes_cbc_encrypt;
+        } else
+# endif
+# ifdef VPAES_CAPABLE
+        if (VPAES_CAPABLE) {
+            ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+            dat->block = (block128_f) vpaes_decrypt;
+            dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+                (cbc128_f) vpaes_cbc_encrypt : NULL;
+        } else
+# endif
+        {
+            ret = AES_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+            dat->block = (block128_f) AES_decrypt;
+            dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+                (cbc128_f) AES_cbc_encrypt : NULL;
+    } else
+# ifdef HWAES_CAPABLE
+    if (HWAES_CAPABLE) {
+        ret = HWAES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+        dat->block = (block128_f) HWAES_encrypt;
+        dat->stream.cbc = NULL;
+#  ifdef HWAES_cbc_encrypt
+        if (mode == EVP_CIPH_CBC_MODE)
+            dat->stream.cbc = (cbc128_f) HWAES_cbc_encrypt;
+        else
+#  endif
+#  ifdef HWAES_ctr32_encrypt_blocks
+        if (mode == EVP_CIPH_CTR_MODE)
+            dat->stream.ctr = (ctr128_f) HWAES_ctr32_encrypt_blocks;
+        else
+#  endif
+            (void)0;            /* terminate potentially open 'else' */
+    } else
+# endif
+# ifdef BSAES_CAPABLE
+    if (BSAES_CAPABLE && mode == EVP_CIPH_CTR_MODE) {
+        ret = AES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+        dat->block = (block128_f) AES_encrypt;
+        dat->stream.ctr = (ctr128_f) bsaes_ctr32_encrypt_blocks;
+    } else
+# endif
+# ifdef VPAES_CAPABLE
+    if (VPAES_CAPABLE) {
+        ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+        dat->block = (block128_f) vpaes_encrypt;
+        dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+            (cbc128_f) vpaes_cbc_encrypt : NULL;
+    } else
+# endif
+    {
+        ret = AES_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
+        dat->block = (block128_f) AES_encrypt;
+        dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ?
+            (cbc128_f) AES_cbc_encrypt : NULL;
+# ifdef AES_CTR_ASM
+        if (mode == EVP_CIPH_CTR_MODE)
+            dat->stream.ctr = (ctr128_f) AES_ctr32_encrypt;
+# endif
+    }
+
+    if (ret < 0) {
+        EVPerr(EVP_F_AES_INIT_KEY, EVP_R_AES_KEY_SETUP_FAILED);
+        return 0;
+    }
+
+    return 1;
+}
+
+static int aes_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    size_t bl = ctx->cipher->block_size;
+    EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+
+    if (len < bl)
+        return 1;
+
+    if (dat->stream.cbc)
+        (*dat->stream.cbc) (in, out, len, &dat->ks, ctx->iv, ctx->encrypt);
+    else if (ctx->encrypt)
+        CRYPTO_cbc128_encrypt(in, out, len, &dat->ks, ctx->iv, dat->block);
+    else
+        CRYPTO_cbc128_decrypt(in, out, len, &dat->ks, ctx->iv, dat->block);
+
+    return 1;
+}
+
+static int aes_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    size_t bl = ctx->cipher->block_size;
+    size_t i;
+    EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+
+    if (len < bl)
+        return 1;
+
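+    /*
+     * len is reduced by one block in the loop initializer, so the '<='
+     * bound still covers the final full block.
+     */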
+    for (i = 0, len -= bl; i <= len; i += bl)
+        (*dat->block) (in + i, out + i, &dat->ks);
+
+    return 1;
+}
+
+static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+
+    CRYPTO_ofb128_encrypt(in, out, len, &dat->ks,
+                          ctx->iv, &ctx->num, dat->block);
+    return 1;
+}
+
+static int aes_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+
+    CRYPTO_cfb128_encrypt(in, out, len, &dat->ks,
+                          ctx->iv, &ctx->num, ctx->encrypt, dat->block);
+    return 1;
+}
+
+static int aes_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                           const unsigned char *in, size_t len)
+{
+    EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+
+    CRYPTO_cfb128_8_encrypt(in, out, len, &dat->ks,
+                            ctx->iv, &ctx->num, ctx->encrypt, dat->block);
+    return 1;
+}
+
+static int aes_cfb1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                           const unsigned char *in, size_t len)
+{
+    EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+
+    if (ctx->flags & EVP_CIPH_FLAG_LENGTH_BITS) {
+        CRYPTO_cfb128_1_encrypt(in, out, len, &dat->ks,
+                                ctx->iv, &ctx->num, ctx->encrypt, dat->block);
+        return 1;
+    }
+
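+    /* Chunk the input so that the bit count (len * 8) cannot overflow */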
+    while (len >= MAXBITCHUNK) {
+        CRYPTO_cfb128_1_encrypt(in, out, MAXBITCHUNK * 8, &dat->ks,
+                                ctx->iv, &ctx->num, ctx->encrypt, dat->block);
+        len -= MAXBITCHUNK;
+    }
+    if (len)
+        CRYPTO_cfb128_1_encrypt(in, out, len * 8, &dat->ks,
+                                ctx->iv, &ctx->num, ctx->encrypt, dat->block);
+
+    return 1;
+}
+
+static int aes_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    unsigned int num = ctx->num;
+    EVP_AES_KEY *dat = (EVP_AES_KEY *) ctx->cipher_data;
+
+    if (dat->stream.ctr)
+        CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks,
+                                    ctx->iv, ctx->buf, &num, dat->stream.ctr);
+    else
+        CRYPTO_ctr128_encrypt(in, out, len, &dat->ks,
+                              ctx->iv, ctx->buf, &num, dat->block);
+    ctx->num = (size_t)num;
+    return 1;
+}
+
+BLOCK_CIPHER_generic_pack(NID_aes, 128, EVP_CIPH_FLAG_FIPS)
+    BLOCK_CIPHER_generic_pack(NID_aes, 192, EVP_CIPH_FLAG_FIPS)
+    BLOCK_CIPHER_generic_pack(NID_aes, 256, EVP_CIPH_FLAG_FIPS)
+
+static int aes_gcm_cleanup(EVP_CIPHER_CTX *c)
+{
+    EVP_AES_GCM_CTX *gctx = c->cipher_data;
+    OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm));
+    if (gctx->iv != c->iv)
+        OPENSSL_free(gctx->iv);
+    return 1;
+}
+
+/* increment counter (64-bit int) by 1 */
+static void ctr64_inc(unsigned char *counter)
+{
+    int n = 8;
+    unsigned char c;
+
+    do {
+        --n;
+        c = counter[n];
+        ++c;
+        counter[n] = c;
+        if (c)
+            return;
+    } while (n);
+}
+
+static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+{
+    EVP_AES_GCM_CTX *gctx = c->cipher_data;
+    switch (type) {
+    case EVP_CTRL_INIT:
+        gctx->key_set = 0;
+        gctx->iv_set = 0;
+        gctx->ivlen = c->cipher->iv_len;
+        gctx->iv = c->iv;
+        gctx->taglen = -1;
+        gctx->iv_gen = 0;
+        gctx->tls_aad_len = -1;
+        return 1;
+
+    case EVP_CTRL_GCM_SET_IVLEN:
+        if (arg <= 0)
+            return 0;
+        /* Allocate memory for IV if needed */
+        if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen)) {
+            if (gctx->iv != c->iv)
+                OPENSSL_free(gctx->iv);
+            gctx->iv = OPENSSL_malloc(arg);
+            if (!gctx->iv)
+                return 0;
+        }
+        gctx->ivlen = arg;
+        return 1;
+
+    case EVP_CTRL_GCM_SET_TAG:
+        if (arg <= 0 || arg > 16 || c->encrypt)
+            return 0;
+        memcpy(c->buf, ptr, arg);
+        gctx->taglen = arg;
+        return 1;
+
+    case EVP_CTRL_GCM_GET_TAG:
+        if (arg <= 0 || arg > 16 || !c->encrypt || gctx->taglen < 0)
+            return 0;
+        memcpy(ptr, c->buf, arg);
+        return 1;
+
+    case EVP_CTRL_GCM_SET_IV_FIXED:
+        /* Special case: -1 length restores whole IV */
+        if (arg == -1) {
+            memcpy(gctx->iv, ptr, gctx->ivlen);
+            gctx->iv_gen = 1;
+            return 1;
+        }
+        /*
+         * Fixed field must be at least 4 bytes and invocation field at least
+         * 8.
+         */
+        if ((arg < 4) || (gctx->ivlen - arg) < 8)
+            return 0;
+        if (arg)
+            memcpy(gctx->iv, ptr, arg);
+        if (c->encrypt && RAND_bytes(gctx->iv + arg, gctx->ivlen - arg) <= 0)
+            return 0;
+        gctx->iv_gen = 1;
+        return 1;
+
+    case EVP_CTRL_GCM_IV_GEN:
+        if (gctx->iv_gen == 0 || gctx->key_set == 0)
+            return 0;
+        CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+        if (arg <= 0 || arg > gctx->ivlen)
+            arg = gctx->ivlen;
+        memcpy(ptr, gctx->iv + gctx->ivlen - arg, arg);
+        /*
+         * The invocation field will be at least 8 bytes in size, so there
+         * is no need to check for wraparound or to increment more than the
+         * last 8 bytes.
+         */
+        ctr64_inc(gctx->iv + gctx->ivlen - 8);
+        gctx->iv_set = 1;
+        return 1;
+
+    case EVP_CTRL_GCM_SET_IV_INV:
+        if (gctx->iv_gen == 0 || gctx->key_set == 0 || c->encrypt)
+            return 0;
+        memcpy(gctx->iv + gctx->ivlen - arg, ptr, arg);
+        CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+        gctx->iv_set = 1;
+        return 1;
+
+    case EVP_CTRL_AEAD_TLS1_AAD:
+        /* Save the AAD for later use */
+        if (arg != 13)
+            return 0;
+        memcpy(c->buf, ptr, arg);
+        gctx->tls_aad_len = arg;
+        {
+            unsigned int len = c->buf[arg - 2] << 8 | c->buf[arg - 1];
+            /* Correct length for explicit IV */
+            len -= EVP_GCM_TLS_EXPLICIT_IV_LEN;
+            /* If decrypting, correct for the tag too */
+            if (!c->encrypt)
+                len -= EVP_GCM_TLS_TAG_LEN;
+            c->buf[arg - 2] = len >> 8;
+            c->buf[arg - 1] = len & 0xff;
+        }
+        /* Extra padding: tag appended to record */
+        return EVP_GCM_TLS_TAG_LEN;
+
+    case EVP_CTRL_COPY:
+        {
+            EVP_CIPHER_CTX *out = ptr;
+            EVP_AES_GCM_CTX *gctx_out = out->cipher_data;
+            if (gctx->gcm.key) {
+                if (gctx->gcm.key != &gctx->ks)
+                    return 0;
+                gctx_out->gcm.key = &gctx_out->ks;
+            }
+            if (gctx->iv == c->iv)
+                gctx_out->iv = out->iv;
+            else {
+                gctx_out->iv = OPENSSL_malloc(gctx->ivlen);
+                if (!gctx_out->iv)
+                    return 0;
+                memcpy(gctx_out->iv, gctx->iv, gctx->ivlen);
+            }
+            return 1;
+        }
+
+    default:
+        return -1;
+
+    }
+}
+
+static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                            const unsigned char *iv, int enc)
+{
+    EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+    if (!iv && !key)
+        return 1;
+    if (key) {
+        do {
+# ifdef HWAES_CAPABLE
+            if (HWAES_CAPABLE) {
+                HWAES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
+                CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+                                   (block128_f) HWAES_encrypt);
+#  ifdef HWAES_ctr32_encrypt_blocks
+                gctx->ctr = (ctr128_f) HWAES_ctr32_encrypt_blocks;
+#  else
+                gctx->ctr = NULL;
+#  endif
+                break;
+            } else
+# endif
+# ifdef BSAES_CAPABLE
+            if (BSAES_CAPABLE) {
+                AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
+                CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+                                   (block128_f) AES_encrypt);
+                gctx->ctr = (ctr128_f) bsaes_ctr32_encrypt_blocks;
+                break;
+            } else
+# endif
+# ifdef VPAES_CAPABLE
+            if (VPAES_CAPABLE) {
+                vpaes_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
+                CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+                                   (block128_f) vpaes_encrypt);
+                gctx->ctr = NULL;
+                break;
+            } else
+# endif
+                (void)0;        /* terminate potentially open 'else' */
+
+            AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
+            CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+                               (block128_f) AES_encrypt);
+# ifdef AES_CTR_ASM
+            gctx->ctr = (ctr128_f) AES_ctr32_encrypt;
+# else
+            gctx->ctr = NULL;
+# endif
+        } while (0);
+
+        /*
+         * If we have an IV, set it directly; otherwise use the saved IV.
+         */
+        if (iv == NULL && gctx->iv_set)
+            iv = gctx->iv;
+        if (iv) {
+            CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+            gctx->iv_set = 1;
+        }
+        gctx->key_set = 1;
+    } else {
+        /* If the key is set, use the IV now; otherwise save a copy of it */
+        if (gctx->key_set)
+            CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+        else
+            memcpy(gctx->iv, iv, gctx->ivlen);
+        gctx->iv_set = 1;
+        gctx->iv_gen = 0;
+    }
+    return 1;
+}
+
+/*
+ * Handle TLS GCM packet format. This consists of the last portion of the IV
+ * followed by the payload and finally the tag. On encrypt generate IV,
+ * encrypt payload and write the tag. On verify retrieve IV, decrypt payload
+ * and verify tag.
+ */
+
+static int aes_gcm_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                              const unsigned char *in, size_t len)
+{
+    EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+    int rv = -1;
+    /* Encrypt/decrypt must be performed in place */
+    if (out != in
+        || len < (EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN))
+        return -1;
+    /*
+     * Set IV from start of buffer or generate IV and write to start of
+     * buffer.
+     */
+    if (EVP_CIPHER_CTX_ctrl(ctx, ctx->encrypt ?
+                            EVP_CTRL_GCM_IV_GEN : EVP_CTRL_GCM_SET_IV_INV,
+                            EVP_GCM_TLS_EXPLICIT_IV_LEN, out) <= 0)
+        goto err;
+    /* Use saved AAD */
+    if (CRYPTO_gcm128_aad(&gctx->gcm, ctx->buf, gctx->tls_aad_len))
+        goto err;
+    /* Fix buffer and length to point to payload */
+    in += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+    out += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+    len -= EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+    if (ctx->encrypt) {
+        /* Encrypt payload */
+        if (gctx->ctr) {
+            size_t bulk = 0;
+# if defined(AES_GCM_ASM)
+            if (len >= 32 && AES_GCM_ASM(gctx)) {
+                if (CRYPTO_gcm128_encrypt(&gctx->gcm, NULL, NULL, 0))
+                    return -1;
+
+                bulk = AES_gcm_encrypt(in, out, len,
+                                       gctx->gcm.key,
+                                       gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+                gctx->gcm.len.u[1] += bulk;
+            }
+# endif
+            if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+                                            in + bulk,
+                                            out + bulk,
+                                            len - bulk, gctx->ctr))
+                goto err;
+        } else {
+            size_t bulk = 0;
+# if defined(AES_GCM_ASM2)
+            if (len >= 32 && AES_GCM_ASM2(gctx)) {
+                if (CRYPTO_gcm128_encrypt(&gctx->gcm, NULL, NULL, 0))
+                    return -1;
+
+                bulk = AES_gcm_encrypt(in, out, len,
+                                       gctx->gcm.key,
+                                       gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+                gctx->gcm.len.u[1] += bulk;
+            }
+# endif
+            if (CRYPTO_gcm128_encrypt(&gctx->gcm,
+                                      in + bulk, out + bulk, len - bulk))
+                goto err;
+        }
+        out += len;
+        /* Finally write tag */
+        CRYPTO_gcm128_tag(&gctx->gcm, out, EVP_GCM_TLS_TAG_LEN);
+        rv = len + EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+    } else {
+        /* Decrypt */
+        if (gctx->ctr) {
+            size_t bulk = 0;
+# if defined(AES_GCM_ASM)
+            if (len >= 16 && AES_GCM_ASM(gctx)) {
+                if (CRYPTO_gcm128_decrypt(&gctx->gcm, NULL, NULL, 0))
+                    return -1;
+
+                bulk = AES_gcm_decrypt(in, out, len,
+                                       gctx->gcm.key,
+                                       gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+                gctx->gcm.len.u[1] += bulk;
+            }
+# endif
+            if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+                                            in + bulk,
+                                            out + bulk,
+                                            len - bulk, gctx->ctr))
+                goto err;
+        } else {
+            size_t bulk = 0;
+# if defined(AES_GCM_ASM2)
+            if (len >= 16 && AES_GCM_ASM2(gctx)) {
+                if (CRYPTO_gcm128_decrypt(&gctx->gcm, NULL, NULL, 0))
+                    return -1;
+
+                bulk = AES_gcm_decrypt(in, out, len,
+                                       gctx->gcm.key,
+                                       gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+                gctx->gcm.len.u[1] += bulk;
+            }
+# endif
+            if (CRYPTO_gcm128_decrypt(&gctx->gcm,
+                                      in + bulk, out + bulk, len - bulk))
+                goto err;
+        }
+        /* Retrieve tag */
+        CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, EVP_GCM_TLS_TAG_LEN);
+        /* If tag mismatch wipe buffer */
+        if (memcmp(ctx->buf, in + len, EVP_GCM_TLS_TAG_LEN)) {
+            OPENSSL_cleanse(out, len);
+            goto err;
+        }
+        rv = len;
+    }
+
+ err:
+    gctx->iv_set = 0;
+    gctx->tls_aad_len = -1;
+    return rv;
+}
+
+static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+    /* If not set up, return error */
+    if (!gctx->key_set)
+        return -1;
+
+    if (gctx->tls_aad_len >= 0)
+        return aes_gcm_tls_cipher(ctx, out, in, len);
+
+    if (!gctx->iv_set)
+        return -1;
+    if (in) {
+        if (out == NULL) {
+            if (CRYPTO_gcm128_aad(&gctx->gcm, in, len))
+                return -1;
+        } else if (ctx->encrypt) {
+            if (gctx->ctr) {
+                size_t bulk = 0;
+# if defined(AES_GCM_ASM)
+                if (len >= 32 && AES_GCM_ASM(gctx)) {
+                    size_t res = (16 - gctx->gcm.mres) % 16;
+
+                    if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res))
+                        return -1;
+
+                    bulk = AES_gcm_encrypt(in + res,
+                                           out + res, len - res,
+                                           gctx->gcm.key, gctx->gcm.Yi.c,
+                                           gctx->gcm.Xi.u);
+                    gctx->gcm.len.u[1] += bulk;
+                    bulk += res;
+                }
+# endif
+                if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+                                                in + bulk,
+                                                out + bulk,
+                                                len - bulk, gctx->ctr))
+                    return -1;
+            } else {
+                size_t bulk = 0;
+# if defined(AES_GCM_ASM2)
+                if (len >= 32 && AES_GCM_ASM2(gctx)) {
+                    size_t res = (16 - gctx->gcm.mres) % 16;
+
+                    if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res))
+                        return -1;
+
+                    bulk = AES_gcm_encrypt(in + res,
+                                           out + res, len - res,
+                                           gctx->gcm.key, gctx->gcm.Yi.c,
+                                           gctx->gcm.Xi.u);
+                    gctx->gcm.len.u[1] += bulk;
+                    bulk += res;
+                }
+# endif
+                if (CRYPTO_gcm128_encrypt(&gctx->gcm,
+                                          in + bulk, out + bulk, len - bulk))
+                    return -1;
+            }
+        } else {
+            if (gctx->ctr) {
+                size_t bulk = 0;
+# if defined(AES_GCM_ASM)
+                if (len >= 16 && AES_GCM_ASM(gctx)) {
+                    size_t res = (16 - gctx->gcm.mres) % 16;
+
+                    if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res))
+                        return -1;
+
+                    bulk = AES_gcm_decrypt(in + res,
+                                           out + res, len - res,
+                                           gctx->gcm.key,
+                                           gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+                    gctx->gcm.len.u[1] += bulk;
+                    bulk += res;
+                }
+# endif
+                if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+                                                in + bulk,
+                                                out + bulk,
+                                                len - bulk, gctx->ctr))
+                    return -1;
+            } else {
+                size_t bulk = 0;
+# if defined(AES_GCM_ASM2)
+                if (len >= 16 && AES_GCM_ASM2(gctx)) {
+                    size_t res = (16 - gctx->gcm.mres) % 16;
+
+                    if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res))
+                        return -1;
+
+                    bulk = AES_gcm_decrypt(in + res,
+                                           out + res, len - res,
+                                           gctx->gcm.key,
+                                           gctx->gcm.Yi.c, gctx->gcm.Xi.u);
+                    gctx->gcm.len.u[1] += bulk;
+                    bulk += res;
+                }
+# endif
+                if (CRYPTO_gcm128_decrypt(&gctx->gcm,
+                                          in + bulk, out + bulk, len - bulk))
+                    return -1;
+            }
+        }
+        return len;
+    } else {
+        if (!ctx->encrypt) {
+            if (gctx->taglen < 0)
+                return -1;
+            if (CRYPTO_gcm128_finish(&gctx->gcm, ctx->buf, gctx->taglen) != 0)
+                return -1;
+            gctx->iv_set = 0;
+            return 0;
+        }
+        CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, 16);
+        gctx->taglen = 16;
+        /* Don't reuse the IV */
+        gctx->iv_set = 0;
+        return 0;
+    }
+
+}
+
+# define CUSTOM_FLAGS    (EVP_CIPH_FLAG_DEFAULT_ASN1 \
+                | EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
+                | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \
+                | EVP_CIPH_CUSTOM_COPY)
+
+BLOCK_CIPHER_custom(NID_aes, 128, 1, 12, gcm, GCM,
+                    EVP_CIPH_FLAG_FIPS | EVP_CIPH_FLAG_AEAD_CIPHER |
+                    CUSTOM_FLAGS)
+    BLOCK_CIPHER_custom(NID_aes, 192, 1, 12, gcm, GCM,
+                    EVP_CIPH_FLAG_FIPS | EVP_CIPH_FLAG_AEAD_CIPHER |
+                    CUSTOM_FLAGS)
+    BLOCK_CIPHER_custom(NID_aes, 256, 1, 12, gcm, GCM,
+                    EVP_CIPH_FLAG_FIPS | EVP_CIPH_FLAG_AEAD_CIPHER |
+                    CUSTOM_FLAGS)
+
+static int aes_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+{
+    EVP_AES_XTS_CTX *xctx = c->cipher_data;
+    if (type == EVP_CTRL_COPY) {
+        EVP_CIPHER_CTX *out = ptr;
+        EVP_AES_XTS_CTX *xctx_out = out->cipher_data;
+        if (xctx->xts.key1) {
+            if (xctx->xts.key1 != &xctx->ks1)
+                return 0;
+            xctx_out->xts.key1 = &xctx_out->ks1;
+        }
+        if (xctx->xts.key2) {
+            if (xctx->xts.key2 != &xctx->ks2)
+                return 0;
+            xctx_out->xts.key2 = &xctx_out->ks2;
+        }
+        return 1;
+    } else if (type != EVP_CTRL_INIT)
+        return -1;
+    /* key1 and key2 are used as indicators that both key and IV are set */
+    xctx->xts.key1 = NULL;
+    xctx->xts.key2 = NULL;
+    return 1;
+}
+
+static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                            const unsigned char *iv, int enc)
+{
+    EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+    if (!iv && !key)
+        return 1;
+
+    if (key)
+        do {
+# ifdef AES_XTS_ASM
+            xctx->stream = enc ? AES_xts_encrypt : AES_xts_decrypt;
+# else
+            xctx->stream = NULL;
+# endif
+            /* key_len is two AES keys */
+# ifdef HWAES_CAPABLE
+            if (HWAES_CAPABLE) {
+                if (enc) {
+                    HWAES_set_encrypt_key(key, ctx->key_len * 4,
+                                          &xctx->ks1.ks);
+                    xctx->xts.block1 = (block128_f) HWAES_encrypt;
+                } else {
+                    HWAES_set_decrypt_key(key, ctx->key_len * 4,
+                                          &xctx->ks1.ks);
+                    xctx->xts.block1 = (block128_f) HWAES_decrypt;
+                }
+
+                HWAES_set_encrypt_key(key + ctx->key_len / 2,
+                                      ctx->key_len * 4, &xctx->ks2.ks);
+                xctx->xts.block2 = (block128_f) HWAES_encrypt;
+
+                xctx->xts.key1 = &xctx->ks1;
+                break;
+            } else
+# endif
+# ifdef BSAES_CAPABLE
+            if (BSAES_CAPABLE)
+                xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
+            else
+# endif
+# ifdef VPAES_CAPABLE
+            if (VPAES_CAPABLE) {
+                if (enc) {
+                    vpaes_set_encrypt_key(key, ctx->key_len * 4,
+                                          &xctx->ks1.ks);
+                    xctx->xts.block1 = (block128_f) vpaes_encrypt;
+                } else {
+                    vpaes_set_decrypt_key(key, ctx->key_len * 4,
+                                          &xctx->ks1.ks);
+                    xctx->xts.block1 = (block128_f) vpaes_decrypt;
+                }
+
+                vpaes_set_encrypt_key(key + ctx->key_len / 2,
+                                      ctx->key_len * 4, &xctx->ks2.ks);
+                xctx->xts.block2 = (block128_f) vpaes_encrypt;
+
+                xctx->xts.key1 = &xctx->ks1;
+                break;
+            } else
+# endif
+                (void)0;        /* terminate potentially open 'else' */
+
+            if (enc) {
+                AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+                xctx->xts.block1 = (block128_f) AES_encrypt;
+            } else {
+                AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+                xctx->xts.block1 = (block128_f) AES_decrypt;
+            }
+
+            AES_set_encrypt_key(key + ctx->key_len / 2,
+                                ctx->key_len * 4, &xctx->ks2.ks);
+            xctx->xts.block2 = (block128_f) AES_encrypt;
+
+            xctx->xts.key1 = &xctx->ks1;
+        } while (0);
+
+    if (iv) {
+        xctx->xts.key2 = &xctx->ks2;
+        memcpy(ctx->iv, iv, 16);
+    }
+
+    return 1;
+}
+
+static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+    if (!xctx->xts.key1 || !xctx->xts.key2)
+        return 0;
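+    /* XTS requires at least one full AES block of input */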
+    if (!out || !in || len < AES_BLOCK_SIZE)
+        return 0;
+    if (xctx->stream)
+        (*xctx->stream) (in, out, len,
+                         xctx->xts.key1, xctx->xts.key2, ctx->iv);
+    else if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len,
+                                   ctx->encrypt))
+        return 0;
+    return 1;
+}
+
+# define aes_xts_cleanup NULL
+
+# define XTS_FLAGS       (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \
+                         | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \
+                         | EVP_CIPH_CUSTOM_COPY)
+
+BLOCK_CIPHER_custom(NID_aes, 128, 1, 16, xts, XTS,
+                    EVP_CIPH_FLAG_FIPS | XTS_FLAGS)
+    BLOCK_CIPHER_custom(NID_aes, 256, 1, 16, xts, XTS,
+                    EVP_CIPH_FLAG_FIPS | XTS_FLAGS)
+
+static int aes_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+{
+    EVP_AES_CCM_CTX *cctx = c->cipher_data;
+    switch (type) {
+    case EVP_CTRL_INIT:
+        cctx->key_set = 0;
+        cctx->iv_set = 0;
+        cctx->L = 8;
+        cctx->M = 12;
+        cctx->tag_set = 0;
+        cctx->len_set = 0;
+        return 1;
+
+    case EVP_CTRL_CCM_SET_IVLEN:
+        arg = 15 - arg;
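+        /* fall through */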
+    case EVP_CTRL_CCM_SET_L:
+        if (arg < 2 || arg > 8)
+            return 0;
+        cctx->L = arg;
+        return 1;
+
+    case EVP_CTRL_CCM_SET_TAG:
+        if ((arg & 1) || arg < 4 || arg > 16)
+            return 0;
+        if (c->encrypt && ptr)
+            return 0;
+        if (ptr) {
+            cctx->tag_set = 1;
+            memcpy(c->buf, ptr, arg);
+        }
+        cctx->M = arg;
+        return 1;
+
+    case EVP_CTRL_CCM_GET_TAG:
+        if (!c->encrypt || !cctx->tag_set)
+            return 0;
+        if (!CRYPTO_ccm128_tag(&cctx->ccm, ptr, (size_t)arg))
+            return 0;
+        cctx->tag_set = 0;
+        cctx->iv_set = 0;
+        cctx->len_set = 0;
+        return 1;
+
+    case EVP_CTRL_COPY:
+        {
+            EVP_CIPHER_CTX *out = ptr;
+            EVP_AES_CCM_CTX *cctx_out = out->cipher_data;
+            if (cctx->ccm.key) {
+                if (cctx->ccm.key != &cctx->ks)
+                    return 0;
+                cctx_out->ccm.key = &cctx_out->ks;
+            }
+            return 1;
+        }
+
+    default:
+        return -1;
+
+    }
+}
+
+static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                            const unsigned char *iv, int enc)
+{
+    EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+    if (!iv && !key)
+        return 1;
+    if (key)
+        do {
+# ifdef HWAES_CAPABLE
+            if (HWAES_CAPABLE) {
+                HWAES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
+
+                CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+                                   &cctx->ks, (block128_f) HWAES_encrypt);
+                cctx->str = NULL;
+                cctx->key_set = 1;
+                break;
+            } else
+# endif
+# ifdef VPAES_CAPABLE
+            if (VPAES_CAPABLE) {
+                vpaes_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
+                CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+                                   &cctx->ks, (block128_f) vpaes_encrypt);
+                cctx->str = NULL;
+                cctx->key_set = 1;
+                break;
+            }
+# endif
+            AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
+            CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+                               &cctx->ks, (block128_f) AES_encrypt);
+            cctx->str = NULL;
+            cctx->key_set = 1;
+        } while (0);
+    if (iv) {
+        memcpy(ctx->iv, iv, 15 - cctx->L);
+        cctx->iv_set = 1;
+    }
+    return 1;
+}
+
+static int aes_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
+{
+    EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+    CCM128_CONTEXT *ccm = &cctx->ccm;
+    /* If not set up, return error */
+    if (!cctx->iv_set && !cctx->key_set)
+        return -1;
+    if (!ctx->encrypt && !cctx->tag_set)
+        return -1;
+    if (!out) {
+        if (!in) {
+            if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L, len))
+                return -1;
+            cctx->len_set = 1;
+            return len;
+        }
+        /* If we have AAD, the message length must be set first */
+        if (!cctx->len_set && len)
+            return -1;
+        CRYPTO_ccm128_aad(ccm, in, len);
+        return len;
+    }
+    /* EVP_*Final() doesn't return any data */
+    if (!in)
+        return 0;
+    /* If the length has not been set yet, do it now */
+    if (!cctx->len_set) {
+        if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L, len))
+            return -1;
+        cctx->len_set = 1;
+    }
+    if (ctx->encrypt) {
+        if (cctx->str ? CRYPTO_ccm128_encrypt_ccm64(ccm, in, out, len,
+                                                    cctx->str) :
+            CRYPTO_ccm128_encrypt(ccm, in, out, len))
+            return -1;
+        cctx->tag_set = 1;
+        return len;
+    } else {
+        int rv = -1;
+        if (cctx->str ? !CRYPTO_ccm128_decrypt_ccm64(ccm, in, out, len,
+                                                     cctx->str) :
+            !CRYPTO_ccm128_decrypt(ccm, in, out, len)) {
+            unsigned char tag[16];
+            if (CRYPTO_ccm128_tag(ccm, tag, cctx->M)) {
+                if (!memcmp(tag, ctx->buf, cctx->M))
+                    rv = len;
+            }
+        }
+        if (rv == -1)
+            OPENSSL_cleanse(out, len);
+        cctx->iv_set = 0;
+        cctx->tag_set = 0;
+        cctx->len_set = 0;
+        return rv;
+    }
+
+}
+
+# define aes_ccm_cleanup NULL
+
+BLOCK_CIPHER_custom(NID_aes, 128, 1, 12, ccm, CCM,
+                    EVP_CIPH_FLAG_FIPS | CUSTOM_FLAGS)
+    BLOCK_CIPHER_custom(NID_aes, 192, 1, 12, ccm, CCM,
+                    EVP_CIPH_FLAG_FIPS | CUSTOM_FLAGS)
+    BLOCK_CIPHER_custom(NID_aes, 256, 1, 12, ccm, CCM,
+                    EVP_CIPH_FLAG_FIPS | CUSTOM_FLAGS)
+#endif
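
For readers who have not used the ctrl-driven CCM flow that the handlers above implement, here is a minimal caller-side sketch (illustrative only, not part of the patch; the 12-byte nonce and tag sizes are assumptions). It follows the order the code above expects: set the IV length and tag size before the key, pass the total message length with NULL in/out pointers, then the AAD, then the payload.

    /* Hedged sketch of one-shot AES-128-CCM sealing; error checks omitted. */
    #include <openssl/evp.h>

    static int ccm_seal(const unsigned char key[16], const unsigned char nonce[12],
                        const unsigned char *aad, int aadlen,
                        const unsigned char *pt, int ptlen,
                        unsigned char *ct, unsigned char tag[12])
    {
        EVP_CIPHER_CTX ctx;
        int outl;

        EVP_CIPHER_CTX_init(&ctx);
        EVP_EncryptInit_ex(&ctx, EVP_aes_128_ccm(), NULL, NULL, NULL);
        EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_CCM_SET_IVLEN, 12, NULL); /* L becomes 15 - 12 = 3 */
        EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_CCM_SET_TAG, 12, NULL);   /* M = 12; ptr must be NULL when encrypting */
        EVP_EncryptInit_ex(&ctx, NULL, NULL, key, nonce);            /* aes_ccm_init_key() */
        EVP_EncryptUpdate(&ctx, NULL, &outl, NULL, ptlen);           /* total length -> CRYPTO_ccm128_setiv() */
        EVP_EncryptUpdate(&ctx, NULL, &outl, aad, aadlen);           /* AAD -> CRYPTO_ccm128_aad() */
        EVP_EncryptUpdate(&ctx, ct, &outl, pt, ptlen);               /* payload -> CRYPTO_ccm128_encrypt*() */
        EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_CCM_GET_TAG, 12, tag);
        EVP_CIPHER_CTX_cleanup(&ctx);
        return outl;
    }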
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/e_des3.c	Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,381 @@
+/* crypto/evp/e_des3.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson ([email protected])"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#ifndef OPENSSL_NO_DES
+# include <openssl/evp.h>
+# include <openssl/objects.h>
+# include "evp_locl.h"
+# include <openssl/des.h>
+# include <openssl/rand.h>
+
+typedef struct {
+    union {
+        double align;
+        DES_key_schedule ks[3];
+    } ks;
+    union {
+        void (*cbc) (const void *, void *, size_t,
+                     const DES_key_schedule *, unsigned char *);
+    } stream;
+} DES_EDE_KEY;
+# define ks1 ks.ks[0]
+# define ks2 ks.ks[1]
+# define ks3 ks.ks[2]
+
+# if defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
+/* ---------^^^ this is not a typo, just a way to detect that
+ * assembler support was in general requested... */
+#  include "sparc_arch.h"
+
+extern unsigned int OPENSSL_sparcv9cap_P[];
+
+#  define SPARC_DES_CAPABLE       (OPENSSL_sparcv9cap_P[1] & CFR_DES)
+
+void des_t4_key_expand(const void *key, DES_key_schedule *ks);
+void des_t4_ede3_cbc_encrypt(const void *inp, void *out, size_t len,
+                             const DES_key_schedule ks[3], unsigned char iv[8]);
+void des_t4_ede3_cbc_decrypt(const void *inp, void *out, size_t len,
+                             const DES_key_schedule ks[3], unsigned char iv[8]);
+# endif
+
+static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                            const unsigned char *iv, int enc);
+
+static int des_ede3_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                             const unsigned char *iv, int enc);
+
+static int des3_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr);
+
+# define data(ctx) ((DES_EDE_KEY *)(ctx)->cipher_data)
+
+/*
+ * Because of the various casts and differing arguments we can't use
+ * IMPLEMENT_BLOCK_CIPHER
+ */
+
+static int des_ede_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                              const unsigned char *in, size_t inl)
+{
+    BLOCK_CIPHER_ecb_loop()
+        DES_ecb3_encrypt((const_DES_cblock *)(in + i),
+                         (DES_cblock *)(out + i),
+                         &data(ctx)->ks1, &data(ctx)->ks2,
+                         &data(ctx)->ks3, ctx->encrypt);
+    return 1;
+}
+
+static int des_ede_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                              const unsigned char *in, size_t inl)
+{
+    while (inl >= EVP_MAXCHUNK) {
+        DES_ede3_ofb64_encrypt(in, out, (long)EVP_MAXCHUNK,
+                               &data(ctx)->ks1, &data(ctx)->ks2,
+                               &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                               &ctx->num);
+        inl -= EVP_MAXCHUNK;
+        in += EVP_MAXCHUNK;
+        out += EVP_MAXCHUNK;
+    }
+    if (inl)
+        DES_ede3_ofb64_encrypt(in, out, (long)inl,
+                               &data(ctx)->ks1, &data(ctx)->ks2,
+                               &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                               &ctx->num);
+
+    return 1;
+}
+
+static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                              const unsigned char *in, size_t inl)
+{
+    DES_EDE_KEY *dat = data(ctx);
+
+# ifdef KSSL_DEBUG
+    {
+        int i;
+        fprintf(stderr, "des_ede_cbc_cipher(ctx=%p, buflen=%d)\n", ctx,
+                ctx->buf_len);
+        fprintf(stderr, "\t iv= ");
+        for (i = 0; i < 8; i++)
+            fprintf(stderr, "%02X", ctx->iv[i]);
+        fprintf(stderr, "\n");
+    }
+# endif                         /* KSSL_DEBUG */
+    if (dat->stream.cbc) {
+        (*dat->stream.cbc) (in, out, inl, dat->ks.ks, ctx->iv);
+        return 1;
+    }
+
+    while (inl >= EVP_MAXCHUNK) {
+        DES_ede3_cbc_encrypt(in, out, (long)EVP_MAXCHUNK,
+                             &dat->ks1, &dat->ks2, &dat->ks3,
+                             (DES_cblock *)ctx->iv, ctx->encrypt);
+        inl -= EVP_MAXCHUNK;
+        in += EVP_MAXCHUNK;
+        out += EVP_MAXCHUNK;
+    }
+    if (inl)
+        DES_ede3_cbc_encrypt(in, out, (long)inl,
+                             &dat->ks1, &dat->ks2, &dat->ks3,
+                             (DES_cblock *)ctx->iv, ctx->encrypt);
+    return 1;
+}
+
+static int des_ede_cfb64_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                                const unsigned char *in, size_t inl)
+{
+    while (inl >= EVP_MAXCHUNK) {
+        DES_ede3_cfb64_encrypt(in, out, (long)EVP_MAXCHUNK,
+                               &data(ctx)->ks1, &data(ctx)->ks2,
+                               &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                               &ctx->num, ctx->encrypt);
+        inl -= EVP_MAXCHUNK;
+        in += EVP_MAXCHUNK;
+        out += EVP_MAXCHUNK;
+    }
+    if (inl)
+        DES_ede3_cfb64_encrypt(in, out, (long)inl,
+                               &data(ctx)->ks1, &data(ctx)->ks2,
+                               &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                               &ctx->num, ctx->encrypt);
+    return 1;
+}
+
+/*
+ * Although we have a CFB-r implementation for 3-DES, it doesn't pack the
+ * right way, so wrap it here
+ */
+static int des_ede3_cfb1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                                const unsigned char *in, size_t inl)
+{
+    size_t n;
+    unsigned char c[1], d[1];
+
+    for (n = 0; n < inl; ++n) {
+        c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 0x80 : 0;
+        DES_ede3_cfb_encrypt(c, d, 1, 1,
+                             &data(ctx)->ks1, &data(ctx)->ks2,
+                             &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                             ctx->encrypt);
+        out[n / 8] = (out[n / 8] & ~(0x80 >> (unsigned int)(n % 8)))
+            | ((d[0] & 0x80) >> (unsigned int)(n % 8));
+    }
+
+    return 1;
+}
+
+static int des_ede3_cfb8_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                                const unsigned char *in, size_t inl)
+{
+    while (inl >= EVP_MAXCHUNK) {
+        DES_ede3_cfb_encrypt(in, out, 8, (long)EVP_MAXCHUNK,
+                             &data(ctx)->ks1, &data(ctx)->ks2,
+                             &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                             ctx->encrypt);
+        inl -= EVP_MAXCHUNK;
+        in += EVP_MAXCHUNK;
+        out += EVP_MAXCHUNK;
+    }
+    if (inl)
+        DES_ede3_cfb_encrypt(in, out, 8, (long)inl,
+                             &data(ctx)->ks1, &data(ctx)->ks2,
+                             &data(ctx)->ks3, (DES_cblock *)ctx->iv,
+                             ctx->encrypt);
+    return 1;
+}
+
+BLOCK_CIPHER_defs(des_ede, DES_EDE_KEY, NID_des_ede, 8, 16, 8, 64,
+                  EVP_CIPH_RAND_KEY | EVP_CIPH_FLAG_DEFAULT_ASN1,
+                  des_ede_init_key, NULL, NULL, NULL, des3_ctrl)
+# define des_ede3_cfb64_cipher des_ede_cfb64_cipher
+# define des_ede3_ofb_cipher des_ede_ofb_cipher
+# define des_ede3_cbc_cipher des_ede_cbc_cipher
+# define des_ede3_ecb_cipher des_ede_ecb_cipher
+    BLOCK_CIPHER_defs(des_ede3, DES_EDE_KEY, NID_des_ede3, 8, 24, 8, 64,
+                  EVP_CIPH_RAND_KEY | EVP_CIPH_FLAG_FIPS |
+                  EVP_CIPH_FLAG_DEFAULT_ASN1, des_ede3_init_key, NULL, NULL, NULL,
+                  des3_ctrl)
+
+    BLOCK_CIPHER_def_cfb(des_ede3, DES_EDE_KEY, NID_des_ede3, 24, 8, 1,
+                     EVP_CIPH_RAND_KEY | EVP_CIPH_FLAG_FIPS |
+                     EVP_CIPH_FLAG_DEFAULT_ASN1, des_ede3_init_key, NULL, NULL,
+                     NULL, des3_ctrl)
+
+    BLOCK_CIPHER_def_cfb(des_ede3, DES_EDE_KEY, NID_des_ede3, 24, 8, 8,
+                     EVP_CIPH_RAND_KEY | EVP_CIPH_FLAG_FIPS |
+                     EVP_CIPH_FLAG_DEFAULT_ASN1, des_ede3_init_key, NULL, NULL,
+                     NULL, des3_ctrl)
+
+static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                            const unsigned char *iv, int enc)
+{
+    DES_cblock *deskey = (DES_cblock *)key;
+    DES_EDE_KEY *dat = data(ctx);
+
+    dat->stream.cbc = NULL;
+# if defined(SPARC_DES_CAPABLE)
+    if (SPARC_DES_CAPABLE) {
+        int mode = ctx->cipher->flags & EVP_CIPH_MODE;
+
+        if (mode == EVP_CIPH_CBC_MODE) {
+            des_t4_key_expand(&deskey[0], &dat->ks1);
+            des_t4_key_expand(&deskey[1], &dat->ks2);
+            memcpy(&dat->ks3, &dat->ks1, sizeof(dat->ks1));
+            dat->stream.cbc = enc ? des_t4_ede3_cbc_encrypt :
+                des_t4_ede3_cbc_decrypt;
+            return 1;
+        }
+    }
+# endif
+# ifdef EVP_CHECK_DES_KEY
+    if (DES_set_key_checked(&deskey[0], &dat->ks1)
+        || DES_set_key_checked(&deskey[1], &dat->ks2))
+        return 0;
+# else
+    DES_set_key_unchecked(&deskey[0], &dat->ks1);
+    DES_set_key_unchecked(&deskey[1], &dat->ks2);
+# endif
+    memcpy(&dat->ks3, &dat->ks1, sizeof(dat->ks1));
+    return 1;
+}
+
+static int des_ede3_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                             const unsigned char *iv, int enc)
+{
+    DES_cblock *deskey = (DES_cblock *)key;
+    DES_EDE_KEY *dat = data(ctx);
+
+# ifdef KSSL_DEBUG
+    {
+        int i;
+        fprintf(stderr, "des_ede3_init_key(ctx=%p)\n", ctx);
+        fprintf(stderr, "\tKEY= ");
+        for (i = 0; i < 24; i++)
+            fprintf(stderr, "%02X", key[i]);
+        fprintf(stderr, "\n");
+        if (iv) {
+            fprintf(stderr, "\t IV= ");
+            for (i = 0; i < 8; i++)
+                fprintf(stderr, "%02X", iv[i]);
+            fprintf(stderr, "\n");
+        }
+    }
+# endif                         /* KSSL_DEBUG */
+
+    dat->stream.cbc = NULL;
+# if defined(SPARC_DES_CAPABLE)
+    if (SPARC_DES_CAPABLE) {
+        int mode = ctx->cipher->flags & EVP_CIPH_MODE;
+
+        if (mode == EVP_CIPH_CBC_MODE) {
+            des_t4_key_expand(&deskey[0], &dat->ks1);
+            des_t4_key_expand(&deskey[1], &dat->ks2);
+            des_t4_key_expand(&deskey[2], &dat->ks3);
+            dat->stream.cbc = enc ? des_t4_ede3_cbc_encrypt :
+                des_t4_ede3_cbc_decrypt;
+            return 1;
+        }
+    }
+# endif
+# ifdef EVP_CHECK_DES_KEY
+    if (DES_set_key_checked(&deskey[0], &dat->ks1)
+        || DES_set_key_checked(&deskey[1], &dat->ks2)
+        || DES_set_key_checked(&deskey[2], &dat->ks3))
+        return 0;
+# else
+    DES_set_key_unchecked(&deskey[0], &dat->ks1);
+    DES_set_key_unchecked(&deskey[1], &dat->ks2);
+    DES_set_key_unchecked(&deskey[2], &dat->ks3);
+# endif
+    return 1;
+}
+
+static int des3_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+{
+
+    DES_cblock *deskey = ptr;
+
+    switch (type) {
+    case EVP_CTRL_RAND_KEY:
+        if (RAND_bytes(ptr, c->key_len) <= 0)
+            return 0;
+        DES_set_odd_parity(deskey);
+        if (c->key_len >= 16)
+            DES_set_odd_parity(deskey + 1);
+        if (c->key_len >= 24)
+            DES_set_odd_parity(deskey + 2);
+        return 1;
+
+    default:
+        return -1;
+    }
+}
+
+const EVP_CIPHER *EVP_des_ede(void)
+{
+    return &des_ede_ecb;
+}
+
+const EVP_CIPHER *EVP_des_ede3(void)
+{
+    return &des_ede3_ecb;
+}
+
+#endif
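
The 3DES-EDE ciphers above are reached through the generic EVP interface; a minimal caller-side sketch (an illustration, not part of the patch) of CBC encryption follows. When SPARC_DES_CAPABLE is true, des_ede3_init_key() installs des_t4_ede3_cbc_encrypt() as the stream handler; otherwise des_ede_cbc_cipher() falls back to DES_ede3_cbc_encrypt().

    /* Hedged sketch; key/IV values are placeholders, error checks omitted.
     * out must have room for inlen plus one block of padding. */
    #include <openssl/evp.h>

    static int ede3_cbc_encrypt_buf(const unsigned char key[24],
                                    const unsigned char iv[8],
                                    const unsigned char *in, int inlen,
                                    unsigned char *out)
    {
        EVP_CIPHER_CTX ctx;
        int outl = 0, tmplen = 0;

        EVP_CIPHER_CTX_init(&ctx);
        EVP_EncryptInit_ex(&ctx, EVP_des_ede3_cbc(), NULL, key, iv); /* des_ede3_init_key() */
        EVP_EncryptUpdate(&ctx, out, &outl, in, inlen);              /* des_ede_cbc_cipher() */
        EVP_EncryptFinal_ex(&ctx, out + outl, &tmplen);              /* final padded block */
        EVP_CIPHER_CTX_cleanup(&ctx);
        return outl + tmplen;
    }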
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/sha1-sparcv9.pl	Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,428 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
+# ====================================================================
+
+# Performance improvement is not really impressive on pre-T1 CPU: +8%
+# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
+# turned out to be 40% faster than 64-bit code generated by Sun C 5.8 and
+# >2x than 64-bit code generated by gcc 3.4. And there is a gimmick.
+# X[16] vector is packed into 8 64-bit registers and as a result nothing
+# is spilled on stack. In addition input data is loaded in compact
+# instruction sequence, thus minimizing the window when the code is
+# subject to [inter-thread] cache-thrashing hazard. The goal is to
+# ensure scalability on UltraSPARC T1, or rather to avoid decay when
+# the number of active threads exceeds the number of physical cores.
+
+# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
+# faster than software. Multi-process benchmark saturates at 11x
+# single-process result on 8-core processor, or ~9GBps per 2.85GHz
+# socket.
+
+$output=shift;
+open STDOUT,">$output";
+
+@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
+$rot1m="%g2";
+$tmp64="%g3";
+$Xi="%g4";
+$A="%l0";
+$B="%l1";
+$C="%l2";
+$D="%l3";
+$E="%l4";
+@V=($A,$B,$C,$D,$E);
+$K_00_19="%l5";
+$K_20_39="%l6";
+$K_40_59="%l7";
+$K_60_79="%g5";
+@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
+
+$ctx="%i0";
+$inp="%i1";
+$len="%i2";
+$tmp0="%i3";
+$tmp1="%i4";
+$tmp2="%i5";
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $xi=($i&1)?@X[($i/2)%8]:$Xi;
+
+$code.=<<___;
+	sll	$a,5,$tmp0		!! $i
+	add	@K[$i/20],$e,$e
+	srl	$a,27,$tmp1
+	add	$tmp0,$e,$e
+	and	$c,$b,$tmp0
+	add	$tmp1,$e,$e
+	sll	$b,30,$tmp2
+	andn	$d,$b,$tmp1
+	srl	$b,2,$b
+	or	$tmp1,$tmp0,$tmp1
+	or	$tmp2,$b,$b
+	add	$xi,$e,$e
+___
+if ($i&1 && $i<15) {
+	$code.=
+	"	srlx	@X[(($i+1)/2)%8],32,$Xi\n";
+}
+$code.=<<___;
+	add	$tmp1,$e,$e
+___
+}
+
+sub Xupdate {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i/2;
+
+if ($i&1) {
+$code.=<<___;
+	sll	$a,5,$tmp0		!! $i
+	add	@K[$i/20],$e,$e
+	srl	$a,27,$tmp1
+___
+} else {
+$code.=<<___;
+	sllx	@X[($j+6)%8],32,$Xi	! Xupdate($i)
+	xor	@X[($j+1)%8],@X[$j%8],@X[$j%8]
+	srlx	@X[($j+7)%8],32,$tmp1
+	xor	@X[($j+4)%8],@X[$j%8],@X[$j%8]
+	sll	$a,5,$tmp0		!! $i
+	or	$tmp1,$Xi,$Xi
+	add	@K[$i/20],$e,$e		!!
+	xor	$Xi,@X[$j%8],@X[$j%8]
+	srlx	@X[$j%8],31,$Xi
+	add	@X[$j%8],@X[$j%8],@X[$j%8]
+	and	$Xi,$rot1m,$Xi
+	andn	@X[$j%8],$rot1m,@X[$j%8]
+	srl	$a,27,$tmp1		!!
+	or	$Xi,@X[$j%8],@X[$j%8]
+___
+}
+}
+
+sub BODY_16_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+
+	&Xupdate(@_);
+    if ($i&1) {
+	$xi=@X[($i/2)%8];
+    } else {
+	$xi=$Xi;
+	$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
+    }
+$code.=<<___;
+	add	$tmp0,$e,$e		!!
+	and	$c,$b,$tmp0
+	add	$tmp1,$e,$e
+	sll	$b,30,$tmp2
+	add	$xi,$e,$e
+	andn	$d,$b,$tmp1
+	srl	$b,2,$b
+	or	$tmp1,$tmp0,$tmp1
+	or	$tmp2,$b,$b
+	add	$tmp1,$e,$e
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $xi;
+	&Xupdate(@_);
+    if ($i&1) {
+	$xi=@X[($i/2)%8];
+    } else {
+	$xi=$Xi;
+	$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
+    }
+$code.=<<___;
+	add	$tmp0,$e,$e		!!
+	xor	$c,$b,$tmp0
+	add	$tmp1,$e,$e
+	sll	$b,30,$tmp2
+	xor	$d,$tmp0,$tmp1
+	srl	$b,2,$b
+	add	$tmp1,$e,$e
+	or	$tmp2,$b,$b
+	add	$xi,$e,$e
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $xi;
+	&Xupdate(@_);
+    if ($i&1) {
+	$xi=@X[($i/2)%8];
+    } else {
+	$xi=$Xi;
+	$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
+    }
+$code.=<<___;
+	add	$tmp0,$e,$e		!!
+	and	$c,$b,$tmp0
+	add	$tmp1,$e,$e
+	sll	$b,30,$tmp2
+	or	$c,$b,$tmp1
+	srl	$b,2,$b
+	and	$d,$tmp1,$tmp1
+	add	$xi,$e,$e
+	or	$tmp1,$tmp0,$tmp1
+	or	$tmp2,$b,$b
+	add	$tmp1,$e,$e
+___
+}
+
+$code.=<<___;
+#include "sparc_arch.h"
+#include <openssl/fipssyms.h>
+
+#ifdef __arch64__
+.register	%g2,#scratch
+.register	%g3,#scratch
+#endif
+
+.section	".text",#alloc,#execinstr
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+
+.align	32
+.globl	sha1_block_data_order
+sha1_block_data_order:
+	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]
+
+	andcc	%g1, CFR_SHA1, %g0
+	be	.Lsoftware
+	nop
+
+	ld	[%o0 + 0x00], %f0	! load context
+	ld	[%o0 + 0x04], %f1
+	ld	[%o0 + 0x08], %f2
+	andcc	%o1, 0x7, %g0
+	ld	[%o0 + 0x0c], %f3
+	bne,pn	%icc, .Lhwunaligned
+	 ld	[%o0 + 0x10], %f4
+
+.Lhw_loop:
+	ldd	[%o1 + 0x00], %f8
+	ldd	[%o1 + 0x08], %f10
+	ldd	[%o1 + 0x10], %f12
+	ldd	[%o1 + 0x18], %f14
+	ldd	[%o1 + 0x20], %f16
+	ldd	[%o1 + 0x28], %f18
+	ldd	[%o1 + 0x30], %f20
+	subcc	%o2, 1, %o2		! done yet? 
+	ldd	[%o1 + 0x38], %f22
+	add	%o1, 0x40, %o1
+	prefetch [%o1 + 63], 20
+
+	.word	0x81b02820		! SHA1
+
+	bne,pt	SIZE_T_CC, .Lhw_loop
+	nop
+
+.Lhwfinish:
+	st	%f0, [%o0 + 0x00]	! store context
+	st	%f1, [%o0 + 0x04]
+	st	%f2, [%o0 + 0x08]
+	st	%f3, [%o0 + 0x0c]
+	retl
+	st	%f4, [%o0 + 0x10]
+
+.align	8
+.Lhwunaligned:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f10
+.Lhwunaligned_loop:
+	ldd	[%o1 + 0x08], %f12
+	ldd	[%o1 + 0x10], %f14
+	ldd	[%o1 + 0x18], %f16
+	ldd	[%o1 + 0x20], %f18
+	ldd	[%o1 + 0x28], %f20
+	ldd	[%o1 + 0x30], %f22
+	ldd	[%o1 + 0x38], %f24
+	subcc	%o2, 1, %o2		! done yet?
+	ldd	[%o1 + 0x40], %f26
+	add	%o1, 0x40, %o1
+	prefetch [%o1 + 63], 20
+
+	faligndata %f10, %f12, %f8
+	faligndata %f12, %f14, %f10
+	faligndata %f14, %f16, %f12
+	faligndata %f16, %f18, %f14
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+
+	.word	0x81b02820		! SHA1
+
+	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
+	for	%f26, %f26, %f10	! %f10=%f26
+
+	ba	.Lhwfinish
+	nop
+
+.align	16
+.Lsoftware:
+	save	%sp,-STACK_FRAME,%sp
+	sllx	$len,6,$len
+	add	$inp,$len,$len
+
+	or	%g0,1,$rot1m
+	sllx	$rot1m,32,$rot1m
+	or	$rot1m,1,$rot1m
+
+	ld	[$ctx+0],$A
+	ld	[$ctx+4],$B
+	ld	[$ctx+8],$C
+	ld	[$ctx+12],$D
+	ld	[$ctx+16],$E
+	andn	$inp,7,$tmp0
+
+	sethi	%hi(0x5a827999),$K_00_19
+	or	$K_00_19,%lo(0x5a827999),$K_00_19
+	sethi	%hi(0x6ed9eba1),$K_20_39
+	or	$K_20_39,%lo(0x6ed9eba1),$K_20_39
+	sethi	%hi(0x8f1bbcdc),$K_40_59
+	or	$K_40_59,%lo(0x8f1bbcdc),$K_40_59
+	sethi	%hi(0xca62c1d6),$K_60_79
+	or	$K_60_79,%lo(0xca62c1d6),$K_60_79
+
+.Lloop:
+	ldx	[$tmp0+0],@X[0]
+	ldx	[$tmp0+16],@X[2]
+	ldx	[$tmp0+32],@X[4]
+	ldx	[$tmp0+48],@X[6]
+	and	$inp,7,$tmp1
+	ldx	[$tmp0+8],@X[1]
+	sll	$tmp1,3,$tmp1
+	ldx	[$tmp0+24],@X[3]
+	subcc	%g0,$tmp1,$tmp2	! should be 64-$tmp1, but -$tmp1 works too
+	ldx	[$tmp0+40],@X[5]
+	bz,pt	%icc,.Laligned
+	ldx	[$tmp0+56],@X[7]
+
+	sllx	@X[0],$tmp1,@X[0]
+	ldx	[$tmp0+64],$tmp64
+___
+for($i=0;$i<7;$i++)
+{   $code.=<<___;
+	srlx	@X[$i+1],$tmp2,$Xi
+	sllx	@X[$i+1],$tmp1,@X[$i+1]
+	or	$Xi,@X[$i],@X[$i]
+___
+}
+$code.=<<___;
+	srlx	$tmp64,$tmp2,$tmp64
+	or	$tmp64,@X[7],@X[7]
+.Laligned:
+	srlx	@X[0],32,$Xi
+___
+for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+for (;$i<20;$i++)	{ &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
+for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+
+	ld	[$ctx+0],@X[0]
+	ld	[$ctx+4],@X[1]
+	ld	[$ctx+8],@X[2]
+	ld	[$ctx+12],@X[3]
+	add	$inp,64,$inp
+	ld	[$ctx+16],@X[4]
+	cmp	$inp,$len
+
+	add	$A,@X[0],$A
+	st	$A,[$ctx+0]
+	add	$B,@X[1],$B
+	st	$B,[$ctx+4]
+	add	$C,@X[2],$C
+	st	$C,[$ctx+8]
+	add	$D,@X[3],$D
+	st	$D,[$ctx+12]
+	add	$E,@X[4],$E
+	st	$E,[$ctx+16]
+
+	bne	SIZE_T_CC,.Lloop
+	andn	$inp,7,$tmp0
+
+	ret
+	restore
+.type	sha1_block_data_order,#function
+.size	sha1_block_data_order,(.-sha1_block_data_order)
+.asciz	"SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align	4
+___
+
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# The idea is to preserve the option of producing a "universal" binary and
+# to let the programmer detect at run-time whether the current CPU is VIS capable.
+sub unvis {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my $ref,$opf;
+my %visopf = (	"faligndata"	=> 0x048,
+		"for"		=> 0x07c	);
+
+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+    if ($opf=$visopf{$mnemonic}) {
+	foreach ($rs1,$rs2,$rd) {
+	    return $ref if (!/%f([0-9]{1,2})/);
+	    $_=$1;
+	    if ($1>=32) {
+		return $ref if ($1&1);
+		# re-encode for upper double register addressing
+		$_=($1|$1>>5)&31;
+	    }
+	}
+
+	return	sprintf ".word\t0x%08x !%s",
+			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+			$ref;
+    } else {
+	return $ref;
+    }
+}
+sub unalignaddr {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my $ref="$mnemonic\t$rs1,$rs2,$rd";
+
+    foreach ($rs1,$rs2,$rd) {
+	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
+	else			{ return $ref; }
+    }
+    return  sprintf ".word\t0x%08x !%s",
+		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
+		    $ref;
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+		&unvis($1,$2,$3,$4)
+	 /ge;
+	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+		&unalignaddr($1,$2,$3,$4)
+	 /ge;
+
+	print $_,"\n";
+}
+
+close STDOUT;
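
The unvis() helper at the end of the script hand-assembles VIS opcodes so the generated .s file can be processed without VIS-specific assembler flags. A worked C restatement of the word it computes (same field positions as the sprintf above; the example instruction is the faligndata used in the unaligned-input path):

    #include <stdint.h>

    /* VIS FPop word: bits 31:30 = 2 and op3 = 0x36 give the 0x81b00000 base,
     * then rd in bits 29:25, rs1 in 18:14, opf in 13:5, rs2 in 4:0. */
    static uint32_t vis_word(uint32_t rd, uint32_t rs1, uint32_t opf, uint32_t rs2)
    {
        return 0x81b00000u | (rd << 25) | (rs1 << 14) | (opf << 5) | rs2;
    }

    /* faligndata %f10, %f12, %f8 (opf 0x048): vis_word(8, 10, 0x048, 12) == 0x91b2890c */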
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/sha512-sparcv9.pl	Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,851 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
+# ====================================================================
+
+# SHA256 performance improvement over compiler generated code varies
+# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
+# build]. Just like in SHA1 module I aim to ensure scalability on
+# UltraSPARC T1 by packing X[16] to 8 64-bit registers.
+
+# SHA512 on pre-T1 UltraSPARC.
+#
+# Performance is >75% better than 64-bit code generated by Sun C and
+# over 2x than 32-bit code. X[16] resides on stack, but access to it
+# is scheduled for L2 latency and staged through 32 least significant
+# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
+# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
+# good [optimal coefficient is 50%].
+#
+# SHA512 on UltraSPARC T1.
+#
+# It's not any faster than 64-bit code generated by Sun C 5.8. This is
+# because 64-bit code generator has the advantage of using 64-bit
+# loads(*) to access X[16], which I consciously traded for 32-/64-bit
+# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
+# code by 60%, not to mention that it doesn't suffer from severe decay
+# when running 4 threads per physical core, and that it leaves gcc
+# [3.4] behind by over a 4x factor! Compared to SHA256, single-thread
+# performance is only 10% better, but overall throughput at the maximum
+# number of threads for a given CPU exceeds that of SHA256
+# by 30% [again, optimal coefficient is 50%].
+#
+# (*)	Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
+#	in-order, i.e. a load instruction has to complete before the next
+#	instruction in the given thread is executed, even if the latter is
+#	not dependent on load result! This means that on T1 two 32-bit
+#	loads are always slower than one 64-bit load. Once again this
+#	is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
+#	2x32-bit loads can be as fast as 1x64-bit ones.
+#
+# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
+# which is 9.3x/11.1x faster than software. Multi-process benchmark
+# saturates at 11.5x single-process result on 8-core processor, or
+# ~11/16GBps per 2.85GHz socket.
+
+$output=shift;
+open STDOUT,">$output";
+
+if ($output =~ /512/) {
+	$label="512";
+	$SZ=8;
+	$LD="ldx";		# load from memory
+	$ST="stx";		# store to memory
+	$SLL="sllx";		# shift left logical
+	$SRL="srlx";		# shift right logical
+	@Sigma0=(28,34,39);
+	@Sigma1=(14,18,41);
+	@sigma0=( 7, 1, 8);	# right shift first
+	@sigma1=( 6,19,61);	# right shift first
+	$lastK=0x817;
+	$rounds=80;
+	$align=4;
+
+	$locals=16*$SZ;		# X[16]
+
+	$A="%o0";
+	$B="%o1";
+	$C="%o2";
+	$D="%o3";
+	$E="%o4";
+	$F="%o5";
+	$G="%g1";
+	$H="%o7";
+	@V=($A,$B,$C,$D,$E,$F,$G,$H);
+} else {
+	$label="256";
+	$SZ=4;
+	$LD="ld";		# load from memory
+	$ST="st";		# store to memory
+	$SLL="sll";		# shift left logical
+	$SRL="srl";		# shift right logical
+	@Sigma0=( 2,13,22);
+	@Sigma1=( 6,11,25);
+	@sigma0=( 3, 7,18);	# right shift first
+	@sigma1=(10,17,19);	# right shift first
+	$lastK=0x8f2;
+	$rounds=64;
+	$align=8;
+
+	$locals=0;		# X[16] is register resident
+	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
+	
+	$A="%l0";
+	$B="%l1";
+	$C="%l2";
+	$D="%l3";
+	$E="%l4";
+	$F="%l5";
+	$G="%l6";
+	$H="%l7";
+	@V=($A,$B,$C,$D,$E,$F,$G,$H);
+}
+$T1="%g2";
+$tmp0="%g3";
+$tmp1="%g4";
+$tmp2="%g5";
+
+$ctx="%i0";
+$inp="%i1";
+$len="%i2";
+$Ktbl="%i3";
+$tmp31="%i4";
+$tmp32="%i5";
+
+########### SHA256
+$Xload = sub {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+
+    if ($i==0) {
+$code.=<<___;
+	ldx	[$inp+0],@X[0]
+	ldx	[$inp+16],@X[2]
+	ldx	[$inp+32],@X[4]
+	ldx	[$inp+48],@X[6]
+	ldx	[$inp+8],@X[1]
+	ldx	[$inp+24],@X[3]
+	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
+	ldx	[$inp+40],@X[5]
+	bz,pt	%icc,.Laligned
+	ldx	[$inp+56],@X[7]
+
+	sllx	@X[0],$tmp31,@X[0]
+	ldx	[$inp+64],$T1
+___
+for($j=0;$j<7;$j++)
+{   $code.=<<___;
+	srlx	@X[$j+1],$tmp32,$tmp1
+	sllx	@X[$j+1],$tmp31,@X[$j+1]
+	or	$tmp1,@X[$j],@X[$j]
+___
+}
+$code.=<<___;
+	srlx	$T1,$tmp32,$T1
+	or	$T1,@X[7],@X[7]
+.Laligned:
+___
+    }
+
+    if ($i&1) {
+	$code.="\tadd	@X[$i/2],$h,$T1\n";
+    } else {
+	$code.="\tsrlx	@X[$i/2],32,$T1\n\tadd	$h,$T1,$T1\n";
+    }
+} if ($SZ==4);
+
+########### SHA512
+$Xload = sub {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
+
+$code.=<<___ if ($i==0);
+	ld	[$inp+0],%l0
+	ld	[$inp+4],%l1
+	ld	[$inp+8],%l2
+	ld	[$inp+12],%l3
+	ld	[$inp+16],%l4
+	ld	[$inp+20],%l5
+	ld	[$inp+24],%l6
+	cmp	$tmp31,0
+	ld	[$inp+28],%l7
+___
+$code.=<<___ if ($i<15);
+	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
+	add	$tmp31,32,$tmp0
+	sllx	@pair[0],$tmp0,$tmp1
+	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
+	srlx	@pair[2],$tmp32,@pair[1]
+	or	$tmp1,$tmp2,$tmp2
+	or	@pair[1],$tmp2,$tmp2
+	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
+	add	$h,$tmp2,$T1
+	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
+___
+$code.=<<___ if ($i==12);
+	bnz,a,pn	%icc,.+8
+	ld	[$inp+128],%l0
+___
+$code.=<<___ if ($i==15);
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
+	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
+	add	$tmp31,32,$tmp0
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
+	sllx	@pair[0],$tmp0,$tmp1
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
+	srlx	@pair[2],$tmp32,@pair[1]
+	or	$tmp1,$tmp2,$tmp2
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
+	or	@pair[1],$tmp2,$tmp2
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
+	add	$h,$tmp2,$T1
+	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
+___
+} if ($SZ==8);
+
+########### common
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+
+    if ($i<16) {
+	&$Xload(@_);
+    } else {
+	$code.="\tadd	$h,$T1,$T1\n";
+    }
+
+$code.=<<___;
+	$SRL	$e,@Sigma1[0],$h	!! $i
+	xor	$f,$g,$tmp2
+	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
+	and	$e,$tmp2,$tmp2
+	$SRL	$e,@Sigma1[1],$tmp0
+	xor	$tmp1,$h,$h
+	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
+	xor	$tmp0,$h,$h
+	$SRL	$e,@Sigma1[2],$tmp0
+	xor	$tmp1,$h,$h
+	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
+	xor	$tmp0,$h,$h
+	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
+	xor	$tmp1,$h,$tmp0		! Sigma1(e)
+
+	$SRL	$a,@Sigma0[0],$h
+	add	$tmp2,$T1,$T1
+	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
+	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
+	add	$tmp0,$T1,$T1
+	$SRL	$a,@Sigma0[1],$tmp0
+	xor	$tmp1,$h,$h
+	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
+	xor	$tmp0,$h,$h
+	$SRL	$a,@Sigma0[2],$tmp0
+	xor	$tmp1,$h,$h	
+	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
+	xor	$tmp0,$h,$h
+	xor	$tmp1,$h,$h		! Sigma0(a)
+
+	or	$a,$b,$tmp0
+	and	$a,$b,$tmp1
+	and	$c,$tmp0,$tmp0
+	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
+	add	$tmp2,$T1,$T1		! +=K[$i]
+	add	$tmp1,$h,$h
+
+	add	$T1,$d,$d
+	add	$T1,$h,$h
+___
+}
+
+########### SHA256
+$BODY_16_XX = sub {
+my $i=@_[0];
+my $xi;
+
+    if ($i&1) {
+	$xi=$tmp32;
+	$code.="\tsrlx	@X[(($i+1)/2)%8],32,$xi\n";
+    } else {
+	$xi=@X[(($i+1)/2)%8];
+    }
+$code.=<<___;
+	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
+	sll	$xi,`32-@sigma0[2]`,$tmp1
+	srl	$xi,@sigma0[1],$tmp0
+	xor	$tmp1,$T1,$T1
+	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
+	xor	$tmp0,$T1,$T1
+	srl	$xi,@sigma0[2],$tmp0
+	xor	$tmp1,$T1,$T1
+___
+    if ($i&1) {
+	$xi=@X[(($i+14)/2)%8];
+    } else {
+	$xi=$tmp32;
+	$code.="\tsrlx	@X[(($i+14)/2)%8],32,$xi\n";
+    }
+$code.=<<___;
+	srl	$xi,@sigma1[0],$tmp2
+	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
+	sll	$xi,`32-@sigma1[2]`,$tmp1
+	srl	$xi,@sigma1[1],$tmp0
+	xor	$tmp1,$tmp2,$tmp2
+	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
+	xor	$tmp0,$tmp2,$tmp2
+	srl	$xi,@sigma1[2],$tmp0
+	xor	$tmp1,$tmp2,$tmp2
+___
+    if ($i&1) {
+	$xi=@X[($i/2)%8];
+$code.=<<___;
+	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
+	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
+	srl	@X[($i/2)%8],0,$tmp0
+	add	$tmp2,$tmp1,$tmp1
+	add	$xi,$T1,$T1			! +=X[i]
+	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
+	add	$tmp1,$T1,$T1
+
+	srl	$T1,0,$T1
+	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
+___
+    } else {
+	$xi=@X[(($i+9)/2)%8];
+$code.=<<___;
+	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
+	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
+	add	$xi,$T1,$T1			! +=X[i+9]
+	add	$tmp2,$tmp1,$tmp1
+	srl	@X[($i/2)%8],0,@X[($i/2)%8]
+	add	$tmp1,$T1,$T1
+
+	sllx	$T1,32,$tmp0
+	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
+___
+    }
+    &BODY_00_15(@_);
+} if ($SZ==4);
+
+########### SHA512
+$BODY_16_XX = sub {
+my $i=@_[0];
+my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
+
+$code.=<<___;
+	sllx	%l2,32,$tmp0		!! Xupdate($i)
+	or	%l3,$tmp0,$tmp0
+
+	srlx	$tmp0,@sigma0[0],$T1
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
+	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
+	srlx	$tmp0,@sigma0[1],$tmp0
+	xor	$tmp1,$T1,$T1
+	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
+	xor	$tmp0,$T1,$T1
+	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
+	xor	$tmp1,$T1,$T1
+	sllx	%l6,32,$tmp2
+	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
+	or	%l7,$tmp2,$tmp2
+
+	srlx	$tmp2,@sigma1[0],$tmp1
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
+	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
+	srlx	$tmp2,@sigma1[1],$tmp2
+	xor	$tmp0,$tmp1,$tmp1
+	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
+	xor	$tmp2,$tmp1,$tmp1
+	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
+	xor	$tmp0,$tmp1,$tmp1
+	sllx	%l4,32,$tmp0
+	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
+	or	%l5,$tmp0,$tmp0
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
+
+	sllx	%l0,32,$tmp2
+	add	$tmp1,$T1,$T1
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
+	or	%l1,$tmp2,$tmp2
+	add	$tmp0,$T1,$T1		! +=X[$i+9]
+	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
+	add	$tmp2,$T1,$T1		! +=X[$i]
+	$ST	$T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
+___
+    &BODY_00_15(@_);
+} if ($SZ==8);
+
+$code.=<<___;
+#include "sparc_arch.h"
+#include <openssl/fipssyms.h>
+
+#ifdef __arch64__
+.register	%g2,#scratch
+.register	%g3,#scratch
+#endif
+
+.section	".text",#alloc,#execinstr
+
+.align	64
+K${label}:
+.type	K${label},#object
+___
+if ($SZ==4) {
+$code.=<<___;
+	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+___
+} else {
+$code.=<<___;
+	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
+	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
+	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
+	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
+	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
+	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
+	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
+	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
+	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
+	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
+	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
+	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
+	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
+	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
+	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
+	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
+	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
+	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
+	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
+	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
+	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
+	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
+	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
+	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
+	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
+	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
+	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
+	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
+	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
+	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
+	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
+	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
+	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
+	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
+	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
+	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
+	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
+	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
+	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
+	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+___
+}
+$code.=<<___;
+.size	K${label},.-K${label}
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+
+.globl	sha${label}_block_data_order
+.align	32
+sha${label}_block_data_order:
+	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]
+
+	andcc	%g1, CFR_SHA${label}, %g0
+	be	.Lsoftware
+	nop
+___
+$code.=<<___ if ($SZ==8); 		# SHA512
+	ldd	[%o0 + 0x00], %f0	! load context
+	ldd	[%o0 + 0x08], %f2
+	ldd	[%o0 + 0x10], %f4
+	ldd	[%o0 + 0x18], %f6
+	ldd	[%o0 + 0x20], %f8
+	ldd	[%o0 + 0x28], %f10
+	andcc	%o1, 0x7, %g0
+	ldd	[%o0 + 0x30], %f12
+	bne,pn	%icc, .Lhwunaligned
+	 ldd	[%o0 + 0x38], %f14
+
+.Lhwaligned_loop:
+	ldd	[%o1 + 0x00], %f16
+	ldd	[%o1 + 0x08], %f18
+	ldd	[%o1 + 0x10], %f20
+	ldd	[%o1 + 0x18], %f22
+	ldd	[%o1 + 0x20], %f24
+	ldd	[%o1 + 0x28], %f26
+	ldd	[%o1 + 0x30], %f28
+	ldd	[%o1 + 0x38], %f30
+	ldd	[%o1 + 0x40], %f32
+	ldd	[%o1 + 0x48], %f34
+	ldd	[%o1 + 0x50], %f36
+	ldd	[%o1 + 0x58], %f38
+	ldd	[%o1 + 0x60], %f40
+	ldd	[%o1 + 0x68], %f42
+	ldd	[%o1 + 0x70], %f44
+	subcc	%o2, 1, %o2		! done yet?
+	ldd	[%o1 + 0x78], %f46
+	add	%o1, 0x80, %o1
+	prefetch [%o1 + 63], 20
+	prefetch [%o1 + 64+63], 20
+
+	.word	0x81b02860		! SHA512
+
+	bne,pt	SIZE_T_CC, .Lhwaligned_loop
+	nop
+
+.Lhwfinish:
+	std	%f0, [%o0 + 0x00]	! store context
+	std	%f2, [%o0 + 0x08]
+	std	%f4, [%o0 + 0x10]
+	std	%f6, [%o0 + 0x18]
+	std	%f8, [%o0 + 0x20]
+	std	%f10, [%o0 + 0x28]
+	std	%f12, [%o0 + 0x30]
+	retl
+	 std	%f14, [%o0 + 0x38]
+
+.align	16
+.Lhwunaligned:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f18
+.Lhwunaligned_loop:
+	ldd	[%o1 + 0x08], %f20
+	ldd	[%o1 + 0x10], %f22
+	ldd	[%o1 + 0x18], %f24
+	ldd	[%o1 + 0x20], %f26
+	ldd	[%o1 + 0x28], %f28
+	ldd	[%o1 + 0x30], %f30
+	ldd	[%o1 + 0x38], %f32
+	ldd	[%o1 + 0x40], %f34
+	ldd	[%o1 + 0x48], %f36
+	ldd	[%o1 + 0x50], %f38
+	ldd	[%o1 + 0x58], %f40
+	ldd	[%o1 + 0x60], %f42
+	ldd	[%o1 + 0x68], %f44
+	ldd	[%o1 + 0x70], %f46
+	ldd	[%o1 + 0x78], %f48
+	subcc	%o2, 1, %o2		! done yet?
+	ldd	[%o1 + 0x80], %f50
+	add	%o1, 0x80, %o1
+	prefetch [%o1 + 63], 20
+	prefetch [%o1 + 64+63], 20
+
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+	faligndata %f26, %f28, %f24
+	faligndata %f28, %f30, %f26
+	faligndata %f30, %f32, %f28
+	faligndata %f32, %f34, %f30
+	faligndata %f34, %f36, %f32
+	faligndata %f36, %f38, %f34
+	faligndata %f38, %f40, %f36
+	faligndata %f40, %f42, %f38
+	faligndata %f42, %f44, %f40
+	faligndata %f44, %f46, %f42
+	faligndata %f46, %f48, %f44
+	faligndata %f48, %f50, %f46
+
+	.word	0x81b02860		! SHA512
+
+	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
+	for	%f50, %f50, %f18	! %f18=%f50
+
+	ba	.Lhwfinish
+	nop
+___
+$code.=<<___ if ($SZ==4); 		# SHA256
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	ld	[%o0 + 0x08], %f2
+	ld	[%o0 + 0x0c], %f3
+	ld	[%o0 + 0x10], %f4
+	ld	[%o0 + 0x14], %f5
+	andcc	%o1, 0x7, %g0
+	ld	[%o0 + 0x18], %f6
+	bne,pn	%icc, .Lhwunaligned
+	 ld	[%o0 + 0x1c], %f7
+
+.Lhwloop:
+	ldd	[%o1 + 0x00], %f8
+	ldd	[%o1 + 0x08], %f10
+	ldd	[%o1 + 0x10], %f12
+	ldd	[%o1 + 0x18], %f14
+	ldd	[%o1 + 0x20], %f16
+	ldd	[%o1 + 0x28], %f18
+	ldd	[%o1 + 0x30], %f20
+	subcc	%o2, 1, %o2		! done yet?
+	ldd	[%o1 + 0x38], %f22
+	add	%o1, 0x40, %o1
+	prefetch [%o1 + 63], 20
+
+	.word	0x81b02840		! SHA256
+
+	bne,pt	SIZE_T_CC, .Lhwloop
+	nop
+
+.Lhwfinish:
+	st	%f0, [%o0 + 0x00]	! store context
+	st	%f1, [%o0 + 0x04]
+	st	%f2, [%o0 + 0x08]
+	st	%f3, [%o0 + 0x0c]
+	st	%f4, [%o0 + 0x10]
+	st	%f5, [%o0 + 0x14]
+	st	%f6, [%o0 + 0x18]
+	retl
+	 st	%f7, [%o0 + 0x1c]
+
+.align	8
+.Lhwunaligned:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f10
+.Lhwunaligned_loop:
+	ldd	[%o1 + 0x08], %f12
+	ldd	[%o1 + 0x10], %f14
+	ldd	[%o1 + 0x18], %f16
+	ldd	[%o1 + 0x20], %f18
+	ldd	[%o1 + 0x28], %f20
+	ldd	[%o1 + 0x30], %f22
+	ldd	[%o1 + 0x38], %f24
+	subcc	%o2, 1, %o2		! done yet?
+	ldd	[%o1 + 0x40], %f26
+	add	%o1, 0x40, %o1
+	prefetch [%o1 + 63], 20
+
+	faligndata %f10, %f12, %f8
+	faligndata %f12, %f14, %f10
+	faligndata %f14, %f16, %f12
+	faligndata %f16, %f18, %f14
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+
+	.word	0x81b02840		! SHA256
+
+	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
+	for	%f26, %f26, %f10	! %f10=%f26
+
+	ba	.Lhwfinish
+	nop
+___
+$code.=<<___;
+.align	16
+.Lsoftware:
+	save	%sp,-STACK_FRAME-$locals,%sp
+	and	$inp,`$align-1`,$tmp31
+	sllx	$len,`log(16*$SZ)/log(2)`,$len
+	andn	$inp,`$align-1`,$inp
+	sll	$tmp31,3,$tmp31
+	add	$inp,$len,$len
+___
+$code.=<<___ if ($SZ==8); # SHA512
+	mov	32,$tmp32
+	sub	$tmp32,$tmp31,$tmp32
+___
+$code.=<<___;
+.Lpic:	call	.+8
+	add	%o7,K${label}-.Lpic,$Ktbl
+
+	$LD	[$ctx+`0*$SZ`],$A
+	$LD	[$ctx+`1*$SZ`],$B
+	$LD	[$ctx+`2*$SZ`],$C
+	$LD	[$ctx+`3*$SZ`],$D
+	$LD	[$ctx+`4*$SZ`],$E
+	$LD	[$ctx+`5*$SZ`],$F
+	$LD	[$ctx+`6*$SZ`],$G
+	$LD	[$ctx+`7*$SZ`],$H
+
+.Lloop:
+___
+for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".L16_xx:\n";
+for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	and	$tmp2,0xfff,$tmp2
+	cmp	$tmp2,$lastK
+	bne	.L16_xx
+	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16
+
+___
+$code.=<<___ if ($SZ==4); # SHA256
+	$LD	[$ctx+`0*$SZ`],@X[0]
+	$LD	[$ctx+`1*$SZ`],@X[1]
+	$LD	[$ctx+`2*$SZ`],@X[2]
+	$LD	[$ctx+`3*$SZ`],@X[3]
+	$LD	[$ctx+`4*$SZ`],@X[4]
+	$LD	[$ctx+`5*$SZ`],@X[5]
+	$LD	[$ctx+`6*$SZ`],@X[6]
+	$LD	[$ctx+`7*$SZ`],@X[7]
+
+	add	$A,@X[0],$A
+	$ST	$A,[$ctx+`0*$SZ`]
+	add	$B,@X[1],$B
+	$ST	$B,[$ctx+`1*$SZ`]
+	add	$C,@X[2],$C
+	$ST	$C,[$ctx+`2*$SZ`]
+	add	$D,@X[3],$D
+	$ST	$D,[$ctx+`3*$SZ`]
+	add	$E,@X[4],$E
+	$ST	$E,[$ctx+`4*$SZ`]
+	add	$F,@X[5],$F
+	$ST	$F,[$ctx+`5*$SZ`]
+	add	$G,@X[6],$G
+	$ST	$G,[$ctx+`6*$SZ`]
+	add	$H,@X[7],$H
+	$ST	$H,[$ctx+`7*$SZ`]
+___
+$code.=<<___ if ($SZ==8); # SHA512
+	ld	[$ctx+`0*$SZ+0`],%l0
+	ld	[$ctx+`0*$SZ+4`],%l1
+	ld	[$ctx+`1*$SZ+0`],%l2
+	ld	[$ctx+`1*$SZ+4`],%l3
+	ld	[$ctx+`2*$SZ+0`],%l4
+	ld	[$ctx+`2*$SZ+4`],%l5
+	ld	[$ctx+`3*$SZ+0`],%l6
+
+	sllx	%l0,32,$tmp0
+	ld	[$ctx+`3*$SZ+4`],%l7
+	sllx	%l2,32,$tmp1
+	or	%l1,$tmp0,$tmp0
+	or	%l3,$tmp1,$tmp1
+	add	$tmp0,$A,$A
+	add	$tmp1,$B,$B
+	$ST	$A,[$ctx+`0*$SZ`]
+	sllx	%l4,32,$tmp2
+	$ST	$B,[$ctx+`1*$SZ`]
+	sllx	%l6,32,$T1
+	or	%l5,$tmp2,$tmp2
+	or	%l7,$T1,$T1
+	add	$tmp2,$C,$C
+	$ST	$C,[$ctx+`2*$SZ`]
+	add	$T1,$D,$D
+	$ST	$D,[$ctx+`3*$SZ`]
+
+	ld	[$ctx+`4*$SZ+0`],%l0
+	ld	[$ctx+`4*$SZ+4`],%l1
+	ld	[$ctx+`5*$SZ+0`],%l2
+	ld	[$ctx+`5*$SZ+4`],%l3
+	ld	[$ctx+`6*$SZ+0`],%l4
+	ld	[$ctx+`6*$SZ+4`],%l5
+	ld	[$ctx+`7*$SZ+0`],%l6
+
+	sllx	%l0,32,$tmp0
+	ld	[$ctx+`7*$SZ+4`],%l7
+	sllx	%l2,32,$tmp1
+	or	%l1,$tmp0,$tmp0
+	or	%l3,$tmp1,$tmp1
+	add	$tmp0,$E,$E
+	add	$tmp1,$F,$F
+	$ST	$E,[$ctx+`4*$SZ`]
+	sllx	%l4,32,$tmp2
+	$ST	$F,[$ctx+`5*$SZ`]
+	sllx	%l6,32,$T1
+	or	%l5,$tmp2,$tmp2
+	or	%l7,$T1,$T1
+	add	$tmp2,$G,$G
+	$ST	$G,[$ctx+`6*$SZ`]
+	add	$T1,$H,$H
+	$ST	$H,[$ctx+`7*$SZ`]
+___
+$code.=<<___;
+	add	$inp,`16*$SZ`,$inp		! advance inp
+	cmp	$inp,$len
+	bne	SIZE_T_CC,.Lloop
+	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl
+
+	ret
+	restore
+.type	sha${label}_block_data_order,#function
+.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
+.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align	4
+___
+
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# The idea is to preserve the option of producing a "universal" binary and
+# to let the programmer detect at run-time whether the current CPU is VIS capable.
+sub unvis {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my $ref,$opf;
+my %visopf = (	"faligndata"	=> 0x048,
+		"for"		=> 0x07c	);
+
+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+    if ($opf=$visopf{$mnemonic}) {
+	foreach ($rs1,$rs2,$rd) {
+	    return $ref if (!/%f([0-9]{1,2})/);
+	    $_=$1;
+	    if ($1>=32) {
+		return $ref if ($1&1);
+		# re-encode for upper double register addressing
+		$_=($1|$1>>5)&31;
+	    }
+	}
+
+	return	sprintf ".word\t0x%08x !%s",
+			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+			$ref;
+    } else {
+	return $ref;
+    }
+}
+sub unalignaddr {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my $ref="$mnemonic\t$rs1,$rs2,$rd";
+
+    foreach ($rs1,$rs2,$rd) {
+	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
+	else			{ return $ref; }
+    }
+    return  sprintf ".word\t0x%08x !%s",
+		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
+		    $ref;
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+		&unvis($1,$2,$3,$4)
+	 /ge;
+	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+		&unalignaddr($1,$2,$3,$4)
+	 /ge;
+
+	print $_,"\n";
+}
+
+close STDOUT;
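
The round function in this script emulates rotates with paired SRL/SLL shifts, since SPARCv9 has no integer rotate instruction. An illustrative C restatement of what those shift pairs compute, for the SHA-512 case (constants taken from @Sigma1 above):

    #include <stdint.h>

    /* Rotate-right built from the same shift pair the assembly uses:
     * (x >> n) combined with (x << (64 - n)). */
    static uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

    /* Sigma1(e) for SHA-512, matching @Sigma1=(14,18,41) in the script. */
    static uint64_t Sigma1_512(uint64_t e)
    {
        return rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41);
    }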
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/sparc_arch.h	Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,101 @@
+#ifndef __SPARC_ARCH_H__
+# define __SPARC_ARCH_H__
+
+# define SPARCV9_TICK_PRIVILEGED (1<<0)
+# define SPARCV9_PREFER_FPU      (1<<1)
+# define SPARCV9_VIS1            (1<<2)
+# define SPARCV9_VIS2            (1<<3)/* reserved */
+# define SPARCV9_FMADD           (1<<4)/* reserved for SPARC64 V */
+# define SPARCV9_BLK             (1<<5)/* VIS1 block copy */
+# define SPARCV9_VIS3            (1<<6)
+# define SPARCV9_RANDOM          (1<<7)
+# define SPARCV9_64BIT_STACK     (1<<8)
+
+/*
+ * OPENSSL_sparcv9cap_P[1] is copy of Compatibility Feature Register,
+ * %asr26, SPARC-T4 and later. There is no SPARCV9_CFR bit in
+ * OPENSSL_sparcv9cap_P[0], as %cfr copy is sufficient...
+ */
+# define CFR_AES         0x00000001/* Supports AES opcodes */
+# define CFR_DES         0x00000002/* Supports DES opcodes */
+# define CFR_KASUMI      0x00000004/* Supports KASUMI opcodes */
+# define CFR_CAMELLIA    0x00000008/* Supports CAMELLIA opcodes */
+# define CFR_MD5         0x00000010/* Supports MD5 opcodes */
+# define CFR_SHA1        0x00000020/* Supports SHA1 opcodes */
+# define CFR_SHA256      0x00000040/* Supports SHA256 opcodes */
+# define CFR_SHA512      0x00000080/* Supports SHA512 opcodes */
+# define CFR_MPMUL       0x00000100/* Supports MPMUL opcodes */
+# define CFR_MONTMUL     0x00000200/* Supports MONTMUL opcodes */
+# define CFR_MONTSQR     0x00000400/* Supports MONTSQR opcodes */
+# define CFR_CRC32C      0x00000800/* Supports CRC32C opcodes */
+
+# if defined(OPENSSL_PIC) && !defined(__PIC__)
+#  define __PIC__
+# endif
+
+# if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__)
+#  define __arch64__
+# endif
+
+# define SPARC_PIC_THUNK(reg)    \
+        .align  32;             \
+.Lpic_thunk:                    \
+        jmp     %o7 + 8;        \
+         add    %o7, reg, reg;
+
+# define SPARC_PIC_THUNK_CALL(reg)                       \
+        sethi   %hi(_GLOBAL_OFFSET_TABLE_-4), reg;      \
+        call    .Lpic_thunk;                            \
+         or     reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg;
+
+# if 1
+#  define SPARC_SETUP_GOT_REG(reg)       SPARC_PIC_THUNK_CALL(reg)
+# else
+#  define SPARC_SETUP_GOT_REG(reg)       \
+        sethi   %hi(_GLOBAL_OFFSET_TABLE_-4), reg;      \
+        call    .+8;                                    \
+        or      reg,%lo(_GLOBAL_OFFSET_TABLE_+4), reg;  \
+        add     %o7, reg, reg
+# endif
+
+# if defined(__arch64__)
+
+#  define SPARC_LOAD_ADDRESS(SYM, reg)   \
+        setx    SYM, %o7, reg;
+#  define LDPTR          ldx
+#  define SIZE_T_CC      %xcc
+#  define STACK_FRAME    192
+#  define STACK_BIAS     2047
+#  define STACK_7thARG   (STACK_BIAS+176)
+
+# else
+
+#  define SPARC_LOAD_ADDRESS(SYM, reg)   \
+        set     SYM, reg;
+#  define LDPTR          ld
+#  define SIZE_T_CC      %icc
+#  define STACK_FRAME    112
+#  define STACK_BIAS     0
+#  define STACK_7thARG   92
+#  define SPARC_LOAD_ADDRESS_LEAF(SYM,reg,tmp) SPARC_LOAD_ADDRESS(SYM,reg)
+
+# endif
+
+# ifdef __PIC__
+#  undef SPARC_LOAD_ADDRESS
+#  undef SPARC_LOAD_ADDRESS_LEAF
+#  define SPARC_LOAD_ADDRESS(SYM, reg)   \
+        SPARC_SETUP_GOT_REG(reg);       \
+        sethi   %hi(SYM), %o7;          \
+        or      %o7, %lo(SYM), %o7;     \
+        LDPTR   [reg + %o7], reg;
+# endif
+
+# ifndef SPARC_LOAD_ADDRESS_LEAF
+#  define SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp) \
+        mov     %o7, tmp;                       \
+        SPARC_LOAD_ADDRESS(SYM, reg)            \
+        mov     tmp, %o7;
+# endif
+
+#endif                          /* __SPARC_ARCH_H__ */
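
sparc_arch.h is shared by the Perl modules above and by C code; the CFR_* bits mirror the Compatibility Feature Register copy held in OPENSSL_sparcv9cap_P[1]. A hedged C sketch of how such a capability test typically looks (the helper name is hypothetical; the actual consumers are the *_CAPABLE macros, e.g. SPARC_DES_CAPABLE in e_des3.c above):

    #include "sparc_arch.h"

    extern unsigned int OPENSSL_sparcv9cap_P[2];

    /* Hypothetical helper: true when the given T4 crypto opcode group is present. */
    static int t4_has(unsigned int cfr_bit)
    {
        /* OPENSSL_sparcv9cap_P[1] is the %cfr copy on SPARC T4 and later. */
        return (OPENSSL_sparcv9cap_P[1] & cfr_bit) != 0;
    }

    /* e.g. t4_has(CFR_AES), t4_has(CFR_SHA256), t4_has(CFR_MONTMUL) */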
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/sparct4-mont.pl	Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,1223 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
+# <appro@openssl.org>. The module is licensed under 2-clause BSD
+# license. November 2012. All rights reserved.
+# ====================================================================
+
+######################################################################
+# Montgomery squaring-n-multiplication module for SPARC T4.
+#
+# The module consists of three parts:
+#
+# 1) collection of "single-op" subroutines that perform single
+#    operation, Montgomery squaring or multiplication, on 512-,
+#    1024-, 1536- and 2048-bit operands;
+# 2) collection of "multi-op" subroutines that perform 5 squaring and
+#    1 multiplication operations on operands of above lengths;
+# 3) fall-back and helper VIS3 subroutines.
+#
+# RSA sign is dominated by the multi-op subroutines, while RSA verify and
+# DSA are dominated by single-op. A special note about the 4096-bit RSA
+# verify result: the operands are too long for the dedicated hardware, so
+# it is handled by the
+# VIS3 code, which is why you don't see any improvement. It's surely
+# possible to improve it [by deploying 'mpmul' instruction], maybe in
+# the future...
+#
+# Performance improvement.
+#
+# 64-bit process, VIS3:
+#                   sign    verify    sign/s verify/s
+# rsa 1024 bits 0.000628s 0.000028s   1592.4  35434.4
+# rsa 2048 bits 0.003282s 0.000106s    304.7   9438.3
+# rsa 4096 bits 0.025866s 0.000340s     38.7   2940.9
+# dsa 1024 bits 0.000301s 0.000332s   3323.7   3013.9
+# dsa 2048 bits 0.001056s 0.001233s    946.9    810.8
+#
+# 64-bit process, this module:
+#                   sign    verify    sign/s verify/s
+# rsa 1024 bits 0.000256s 0.000016s   3904.4  61411.9
+# rsa 2048 bits 0.000946s 0.000029s   1056.8  34292.7
+# rsa 4096 bits 0.005061s 0.000340s    197.6   2940.5
+# dsa 1024 bits 0.000176s 0.000195s   5674.7   5130.5
+# dsa 2048 bits 0.000296s 0.000354s   3383.2   2827.6
+#
+######################################################################
+# 32-bit process, VIS3:
+#                   sign    verify    sign/s verify/s
+# rsa 1024 bits 0.000665s 0.000028s   1504.8  35233.3
+# rsa 2048 bits 0.003349s 0.000106s    298.6   9433.4
+# rsa 4096 bits 0.025959s 0.000341s     38.5   2934.8
+# dsa 1024 bits 0.000320s 0.000341s   3123.3   2929.6
+# dsa 2048 bits 0.001101s 0.001260s    908.2    793.4
+#
+# 32-bit process, this module:
+#                   sign    verify    sign/s verify/s
+# rsa 1024 bits 0.000301s 0.000017s   3317.1  60240.0
+# rsa 2048 bits 0.001034s 0.000030s    966.9  33812.7
+# rsa 4096 bits 0.005244s 0.000341s    190.7   2935.4
+# dsa 1024 bits 0.000201s 0.000205s   4976.1   4879.2
+# dsa 2048 bits 0.000328s 0.000360s   3051.1   2774.2
+#
+# 32-bit code is prone to performance degradation as the rate of
+# interrupts dispatched to the CPU executing the code grows. This is
+# because in the standard handling of an interrupt in a 32-bit process
+# context the upper halves of most integer registers used as input or
+# output are zeroed. This renders the result invalid, and the operation
+# has to be re-run. If the CPU is "bothered" with timer interrupts only,
+# the penalty is hardly measurable. But to mitigate this problem at
+# higher interrupt rates, the contemporary Linux kernel recognizes a
+# biased stack even in 32-bit process context and preserves full
+# register contents.
+# See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
+# for details.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "sparcv9_modes.pl";
+
+$code.=<<___;
+#include "sparc_arch.h"
+#include <openssl/fipssyms.h>
+
+#ifdef	__arch64__
+.register	%g2,#scratch
+.register	%g3,#scratch
+#endif
+
+.section	".text",#alloc,#execinstr
+
+#ifdef	__PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+___
+
+########################################################################
+# Register layout for mont[mul|sqr] instructions.
+# For details see "Oracle SPARC Architecture 2011" manual at
+# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
+#
+my @R=map("%f".2*$_,(0..11,30,31,12..29));
+my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
+my @A=(@N[0..13],@R[14..31]);
+my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
+
+########################################################################
+# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
+#			  const u64 *np,const BN_ULONG *n0);
+#
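+# For illustration only (caller-side sketch, not part of this module): a
+# 2048-bit modulus has num = 2048/64 = 32 limbs, so a dispatcher would call
+#
+#	bn_mul_mont_t4_32(rp, ap, bp, np, n0);
+#
+# and, as the code below shows, when ap == bp the routine branches to the
+# .Lmsquare_ path and performs a Montgomery squaring instead.
+#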
+sub generate_bn_mul_mont_t4() {
+my $NUM=shift;
+my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));
+
+$code.=<<___;
+.globl	bn_mul_mont_t4_$NUM
+.align	32
+bn_mul_mont_t4_$NUM:
+#ifdef	__arch64__
+	mov	0,$sentinel
+	mov	-128,%g4
+#elif defined(SPARCV9_64BIT_STACK)
+	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+	ld	[%g1+0],%g1	! OPENSSL_sparcv9cap_P[0]
+	mov	-2047,%g4
+	and	%g1,SPARCV9_64BIT_STACK,%g1
+	movrz	%g1,0,%g4
+	mov	-1,$sentinel
+	add	%g4,-128,%g4
+#else
+	mov	-1,$sentinel
+	mov	-128,%g4
+#endif
+	sllx	$sentinel,32,$sentinel
+	save	%sp,%g4,%sp
+#ifndef	__arch64__
+	save	%sp,-128,%sp	! warm it up
+	save	%sp,-128,%sp
+	save	%sp,-128,%sp
+	save	%sp,-128,%sp
+	save	%sp,-128,%sp
+	save	%sp,-128,%sp
+	restore
+	restore
+	restore
+	restore
+	restore
+	restore
+#endif
+	and	%sp,1,%g4
+	or	$sentinel,%fp,%fp
+	or	%g4,$sentinel,$sentinel
+
+	! copy arguments to global registers
+	mov	%i0,$rp
+	mov	%i1,$ap
+	mov	%i2,$bp
+	mov	%i3,$np
+	ld	[%i4+0],%f1	! load *n0
+	ld	[%i4+4],%f0
+	fsrc2	%f0,%f60
+___
+
+# load ap[$NUM] ########################################################
+$code.=<<___;
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) {
+my $lo=$i<13?@A[$i+1]:"%o7";
+$code.=<<___;
+	ld	[$ap+$i*8+0],$lo
+	ld	[$ap+$i*8+4],@A[$i]
+	sllx	@A[$i],32,@A[$i]
+	or	$lo,@A[$i],@A[$i]
+___
+}
+for(; $i<$NUM; $i++) {
+my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
+$code.=<<___;
+	ld	[$ap+$i*8+0],$lo
+	ld	[$ap+$i*8+4],$hi
+	fsrc2	$hi,@A[$i]
+___
+}
+# load np[$NUM] ########################################################
+$code.=<<___;
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) {
+my $lo=$i<13?@N[$i+1]:"%o7";
+$code.=<<___;
+	ld	[$np+$i*8+0],$lo
+	ld	[$np+$i*8+4],@N[$i]
+	sllx	@N[$i],32,@N[$i]
+	or	$lo,@N[$i],@N[$i]
+___
+}
+$code.=<<___;
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+___
+for(; $i<28 && $i<$NUM; $i++) {
+my $lo=$i<27?@N[$i+1]:"%o7";
+$code.=<<___;
+	ld	[$np+$i*8+0],$lo
+	ld	[$np+$i*8+4],@N[$i]
+	sllx	@N[$i],32,@N[$i]
+	or	$lo,@N[$i],@N[$i]
+___
+}
+$code.=<<___;
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+___
+for(; $i<$NUM; $i++) {
+my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
+$code.=<<___;
+	ld	[$np+$i*8+0],$lo
+	ld	[$np+$i*8+4],@N[$i]
+	sllx	@N[$i],32,@N[$i]
+	or	$lo,@N[$i],@N[$i]
+___
+}
+$code.=<<___;
+	cmp	$ap,$bp
+	be	SIZE_T_CC,.Lmsquare_$NUM
+	nop
+___
+
+# load bp[$NUM] ########################################################
+$code.=<<___;
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) {
+my $lo=$i<13?@B[$i+1]:"%o7";
+$code.=<<___;
+	ld	[$bp+$i*8+0],$lo
+	ld	[$bp+$i*8+4],@B[$i]
+	sllx	@B[$i],32,@B[$i]
+	or	$lo,@B[$i],@B[$i]
+___
+}
+$code.=<<___;
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+___
+for(; $i<$NUM; $i++) {
+my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
+$code.=<<___;
+	ld	[$bp+$i*8+0],$lo
+	ld	[$bp+$i*8+4],@B[$i]
+	sllx	@B[$i],32,@B[$i]
+	or	$lo,@B[$i],@B[$i]
+___
+}
+# magic ################################################################
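+# The montmul/montsqr instructions are emitted as raw .word values with the
+# operand length, $NUM-1, folded into the low bits of the opcode, presumably
+# so that the code assembles even where the assembler lacks the T4 mnemonics.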
+$code.=<<___;
+	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
+.Lmresume_$NUM:
+	fbu,pn	%fcc3,.Lmabort_$NUM
+#ifndef	__arch64__
+	and	%fp,$sentinel,$sentinel
+	brz,pn	$sentinel,.Lmabort_$NUM
+#endif
+	nop
+#ifdef	__arch64__
+	restore
+	restore
+	restore
+	restore
+	restore
+#else
+	restore;		and	%fp,$sentinel,$sentinel
+	restore;		and	%fp,$sentinel,$sentinel
+	restore;		and	%fp,$sentinel,$sentinel
+	restore;		and	%fp,$sentinel,$sentinel
+	 brz,pn	$sentinel,.Lmabort1_$NUM
+	restore
+#endif
+___
+
+# save tp[$NUM] ########################################################
+for($i=0; $i<14 && $i<$NUM; $i++) {
+$code.=<<___;
+	movxtod	@A[$i],@R[$i]
+___
+}
+$code.=<<___;
+#ifdef	__arch64__
+	restore
+#else
+	 and	%fp,$sentinel,$sentinel
+	restore
+	 and	$sentinel,1,%o7
+	 and	%fp,$sentinel,$sentinel
+	 srl	%fp,0,%fp		! just in case?
+	 or	%o7,$sentinel,$sentinel
+	brz,a,pn $sentinel,.Lmdone_$NUM
+	mov	0,%i0		! return failure
+#endif
+___
+for($i=0; $i<12 && $i<$NUM; $i++) {
+@R[$i] =~ /%f([0-9]+)/;
+my $lo = "%f".($1+1);
+$code.=<<___;
+	st	$lo,[$rp+$i*8+0]
+	st	@R[$i],[$rp+$i*8+4]
+___
+}
+for(; $i<$NUM; $i++) {
+my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
+$code.=<<___;
+	fsrc2	@R[$i],$hi
+	st	$lo,[$rp+$i*8+0]
+	st	$hi,[$rp+$i*8+4]
+___
+}
+$code.=<<___;
+	mov	1,%i0		! return success
+.Lmdone_$NUM:
+	ret
+	restore
+
+.Lmabort_$NUM:
+	restore
+	restore
+	restore
+	restore
+	restore
+.Lmabort1_$NUM:
+	restore
+
+	mov	0,%i0		! return failure
+	ret
+	restore
+
+.align	32
+.Lmsquare_$NUM:
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+	.word   0x81b02940+$NUM-1	! montsqr	$NUM-1
+	ba	.Lmresume_$NUM
+	nop
+.type	bn_mul_mont_t4_$NUM, #function
+.size	bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
+___
+}
+
+for ($i=8;$i<=32;$i+=8) {
+	&generate_bn_mul_mont_t4($i);
+}
+
+########################################################################
+#
+sub load_ccr {
+my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
+$code.=<<___;
+	srl	$pwr,	2,	%o4
+	and	$pwr,	3,	%o5
+	and	%o4,	7,	%o4
+	sll	%o5,	3,	%o5	! offset within first cache line
+	add	%o5,	$ptbl,	$ptbl	! of the pwrtbl
+	or	%g0,	1,	%o5
+	sll	%o5,	%o4,	$ccr
+___
+$code.=<<___	if (!$skip_wr);
+	wr	$ccr,	%g0,	%ccr
+___
+}
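+# Worked example of the load_ccr() arithmetic above: for $pwr == 13,
+# %o5 = 13 & 3 = 1 selects byte offset 1*8 = 8 within the first cache line
+# of the power table, while %o4 = (13 >> 2) & 7 = 3 gives $ccr = 1 << 3,
+# i.e. exactly one condition-code bit set; the gather code below uses that
+# bit to pick one of eight 32-byte-strided table lines.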
+sub load_b_pair {
+my ($pwrtbl,$B0,$B1)=@_;
+
+$code.=<<___;
+	ldx	[$pwrtbl+0*32],	$B0
+	ldx	[$pwrtbl+8*32],	$B1
+	ldx	[$pwrtbl+1*32],	%o4
+	ldx	[$pwrtbl+9*32],	%o5
+	movvs	%icc,	%o4,	$B0
+	ldx	[$pwrtbl+2*32],	%o4
+	movvs	%icc,	%o5,	$B1
+	ldx	[$pwrtbl+10*32],%o5
+	move	%icc,	%o4,	$B0
+	ldx	[$pwrtbl+3*32],	%o4
+	move	%icc,	%o5,	$B1
+	ldx	[$pwrtbl+11*32],%o5
+	movneg	%icc,	%o4,	$B0
+	ldx	[$pwrtbl+4*32],	%o4
+	movneg	%icc,	%o5,	$B1
+	ldx	[$pwrtbl+12*32],%o5
+	movcs	%xcc,	%o4,	$B0
+	ldx	[$pwrtbl+5*32],%o4
+	movcs	%xcc,	%o5,	$B1
+	ldx	[$pwrtbl+13*32],%o5
+	movvs	%xcc,	%o4,	$B0
+	ldx	[$pwrtbl+6*32],	%o4
+	movvs	%xcc,	%o5,	$B1
+	ldx	[$pwrtbl+14*32],%o5
+	move	%xcc,	%o4,	$B0
+	ldx	[$pwrtbl+7*32],	%o4
+	move	%xcc,	%o5,	$B1
+	ldx	[$pwrtbl+15*32],%o5
+	movneg	%xcc,	%o4,	$B0
+	add	$pwrtbl,16*32,	$pwrtbl
+	movneg	%xcc,	%o5,	$B1
+___
+}
+sub load_b {
+my ($pwrtbl,$Bi)=@_;
+
+$code.=<<___;
+	ldx	[$pwrtbl+0*32],	$Bi
+	ldx	[$pwrtbl+1*32],	%o4
+	ldx	[$pwrtbl+2*32],	%o5
+	movvs	%icc,	%o4,	$Bi
+	ldx	[$pwrtbl+3*32],	%o4
+	move	%icc,	%o5,	$Bi
+	ldx	[$pwrtbl+4*32],	%o5
+	movneg	%icc,	%o4,	$Bi
+	ldx	[$pwrtbl+5*32],	%o4
+	movcs	%xcc,	%o5,	$Bi
+	ldx	[$pwrtbl+6*32],	%o5
+	movvs	%xcc,	%o4,	$Bi
+	ldx	[$pwrtbl+7*32],	%o4
+	move	%xcc,	%o5,	$Bi
+	add	$pwrtbl,8*32,	$pwrtbl
+	movneg	%xcc,	%o4,	$Bi
+___
+}
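+# Note on load_b()/load_b_pair() above: all 8 (respectively 16) candidate
+# table entries are loaded unconditionally and the wanted one is selected
+# with conditional moves keyed on the single CCR flag set via load_ccr(), so
+# the set of cache lines touched does not depend on the power index; only
+# the in-line byte offset (pwr & 3) and the selected flag do.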
+
+########################################################################
+# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
+#			   const u64 *pwrtbl,int pwr,int stride);
+#
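+# This is the "multi-op" routine described in the module header: each
+# .Lstride_ iteration below gathers the next operand from pwrtbl and performs
+# 5 montsqr operations followed by 1 montmul, which matches one 5-bit window
+# of a fixed-window modular exponentiation.
+#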
+sub generate_bn_pwr5_mont_t4() {
+my $NUM=shift;
+my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));
+
+$code.=<<___;
+.globl	bn_pwr5_mont_t4_$NUM
+.align	32
+bn_pwr5_mont_t4_$NUM:
+#ifdef	__arch64__
+	mov	0,$sentinel
+	mov	-128,%g4
+#elif defined(SPARCV9_64BIT_STACK)
+	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+	ld	[%g1+0],%g1	! OPENSSL_sparcv9cap_P[0]
+	mov	-2047,%g4
+	and	%g1,SPARCV9_64BIT_STACK,%g1
+	movrz	%g1,0,%g4
+	mov	-1,$sentinel
+	add	%g4,-128,%g4
+#else
+	mov	-1,$sentinel
+	mov	-128,%g4
+#endif
+	sllx	$sentinel,32,$sentinel
+	save	%sp,%g4,%sp
+#ifndef	__arch64__
+	save	%sp,-128,%sp	! warm it up
+	save	%sp,-128,%sp
+	save	%sp,-128,%sp
+	save	%sp,-128,%sp
+	save	%sp,-128,%sp
+	save	%sp,-128,%sp
+	restore
+	restore
+	restore
+	restore
+	restore
+	restore
+#endif
+	and	%sp,1,%g4
+	or	$sentinel,%fp,%fp
+	or	%g4,$sentinel,$sentinel
+
+	! copy arguments to global registers
+	mov	%i0,$tp
+	mov	%i1,$np
+	ld	[%i2+0],%f1	! load *n0
+	ld	[%i2+4],%f0
+	mov	%i3,$pwrtbl
+	srl	%i4,%g0,%i4	! pack last arguments
+	sllx	%i5,32,$pwr
+	or	%i4,$pwr,$pwr
+	fsrc2	%f0,%f60
+___
+
+# load tp[$NUM] ########################################################
+$code.=<<___;
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) {
+$code.=<<___;
+	ldx	[$tp+$i*8],@A[$i]
+___
+}
+for(; $i<$NUM; $i++) {
+$code.=<<___;
+	ldd	[$tp+$i*8],@A[$i]
+___
+}
+# load np[$NUM] ########################################################
+$code.=<<___;
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+___
+for($i=0; $i<14 && $i<$NUM; $i++) {
+$code.=<<___;
+	ldx	[$np+$i*8],@N[$i]
+___
+}
+$code.=<<___;
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+___
+for(; $i<28 && $i<$NUM; $i++) {
+$code.=<<___;
+	ldx	[$np+$i*8],@N[$i]
+___
+}
+$code.=<<___;
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+___
+for(; $i<$NUM; $i++) {
+$code.=<<___;
+	ldx	[$np+$i*8],@N[$i]
+___
+}
+# load pwrtbl[pwr] ########################################################
+$code.=<<___;
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+
+	srlx	$pwr,	32,	%o4		! unpack $pwr
+	srl	$pwr,	%g0,	%o5
+	sub	%o4,	5,	%o4
+	mov	$pwrtbl,	%o7
+	sllx	%o4,	32,	$pwr		! re-pack $pwr
+	or	%o5,	$pwr,	$pwr
+	srl	%o5,	%o4,	%o5
+___
+	&load_ccr("%o7","%o5","%o4");
+$code.=<<___;
+	b	.Lstride_$NUM
+	nop
+.align	16
+.Lstride_$NUM:
+___
+for($i=0; $i<14 && $i<$NUM; $i+=2) {
+	&load_b_pair("%o7",@B[$i],@B[$i+1]);
+}
+$code.=<<___;
+	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
+___
+for(; $i<$NUM; $i+=2) {
+	&load_b_pair("%i7",@B[$i],@B[$i+1]);
+}
+$code.=<<___;
+	srax	$pwr,	32,	%o4		! unpack $pwr
+	srl	$pwr,	%g0,	%o5
+	sub	%o4,	5,	%o4
+	mov	$pwrtbl,	%i7
+	sllx	%o4,	32,	$pwr		! re-pack $pwr
+	or	%o5,	$pwr,	$pwr
+	srl	%o5,	%o4,	%o5
+___
+	&load_ccr("%i7","%o5","%o4",1);
+
+# magic ################################################################
+for($i=0; $i<5; $i++) {
+$code.=<<___;
+	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
+	fbu,pn	%fcc3,.Labort_$NUM
+#ifndef	__arch64__
+	and	%fp,$sentinel,$sentinel
+	brz,pn	$sentinel,.Labort_$NUM
+#endif
+	nop
+___
+}
+$code.=<<___;
+	wr	%o4,	%g0,	%ccr
+	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
+	fbu,pn	%fcc3,.Labort_$NUM
+#ifndef	__arch64__
+	and	%fp,$sentinel,$sentinel
+	brz,pn	$sentinel,.Labort_$NUM
+#endif
+
+	srax	$pwr,	32,	%o4
+#ifdef	__arch64__
+	brgez	%o4,.Lstride_$NUM
+	restore
+	restore
+	restore
+	restore
+	restore
+#else
+	brgez	%o4,.Lstride_$NUM
+	restore;		and	%fp,$sentinel,$sentinel
+	restore;		and	%fp,$sentinel,$sentinel
+	restore;		and	%fp,$sentinel,$sentinel
+	restore;		and	%fp,$sentinel,$sentinel
+	 brz,pn	$sentinel,.Labort1_$NUM
+	restore
+#endif
+___
+
+# save tp[$NUM] ########################################################
+for($i=0; $i<14 && $i<$NUM; $i++) {
+$code.=<<___;
+	movxtod	@A[$i],@R[$i]
+___
+}
+$code.=<<___;
+#ifdef	__arch64__
+	restore
+#else
+	 and	%fp,$sentinel,$sentinel
+	restore
+	 and	$sentinel,1,%o7
+	 and	%fp,$sentinel,$sentinel
+	 srl	%fp,0,%fp		! just in case?
+	 or	%o7,$sentinel,$sentinel
+	brz,a,pn $sentinel,.Ldone_$NUM
+	mov	0,%i0		! return failure
+#endif
+___
+for($i=0; $i<$NUM; $i++) {
+$code.=<<___;
+	std	@R[$i],[$tp+$i*8]
+___
+}
+$code.=<<___;
+	mov	1,%i0		! return success
+.Ldone_$NUM:
+	ret
+	restore
+
+.Labort_$NUM:
+	restore
+	restore
+	restore
+	restore
+	restore
+.Labort1_$NUM:
+	restore
+
+	mov	0,%i0		! return failure
+	ret
+	restore
+.type	bn_pwr5_mont_t4_$NUM, #function
+.size	bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
+___
+}
+
+for ($i=8;$i<=32;$i+=8) {
+	&generate_bn_pwr5_mont_t4($i);
+}
+
+{
+########################################################################
+# Fall-back subroutines
+#
+# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
+#
+($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
+	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
+
+# int bn_mul_mont(
+$rp="%o0";	# u64 *rp,
+$ap="%o1";	# const u64 *ap,
+$bp="%o2";	# const u64 *bp,
+$np="%o3";	# const u64 *np,
+$n0p="%o4";	# const BN_ULONG *n0,
+$num="%o5";	# int num);	# caller ensures that num is >=3
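+# The fall-back follows the textbook word-serial Montgomery multiplication:
+# .L1st computes the first row ap[]*bp[0] plus np[]*m1, .Louter/.Linner add
+# one such row per remaining bp word, .Lsub does the final conditional
+# subtraction of np, and .Lcopy writes the selected result back to rp.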
+$code.=<<___;
+.globl	bn_mul_mont_t4
+.align	32
+bn_mul_mont_t4:
+	add	%sp,	STACK_BIAS,	%g4	! real top of stack
+	sll	$num,	3,	$num		! size in bytes
+	add	$num,	63,	%g1
+	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
+	sub	%g4,	%g1,	%g1
+	andn	%g1,	63,	%g1		! align at 64 byte
+	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
+	sub	%g1,	%g4,	%g1
+
+	save	%sp,	%g1,	%sp
+___
+#	+-------------------------------+<-----	%sp
+#	.				.
+#	+-------------------------------+<-----	aligned at 64 bytes
+#	| __int64 tmp[0]		|
+#	+-------------------------------+
+#	.				.
+#	.				.
+#	+-------------------------------+<-----	aligned at 64 bytes
+#	.				.
+($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
+($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
+($ovf,$i)=($t0,$t1);
+$code.=<<___;
+	ld	[$n0p+0],	$t0	! pull n0[0..1] value
+	ld	[$n0p+4],	$t1
+	add	%sp, STACK_BIAS+STACK_FRAME, $tp
+	ldx	[$bp+0],	$m0	! m0=bp[0]
+	sllx	$t1,	32,	$n0
+	add	$bp,	8,	$bp
+	or	$t0,	$n0,	$n0
+
+	ldx	[$ap+0],	$aj	! ap[0]
+
+	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
+	umulxhi	$aj,	$m0,	$hi0
+
+	ldx	[$ap+8],	$aj	! ap[1]
+	add	$ap,	16,	$ap
+	ldx	[$np+0],	$nj	! np[0]
+
+	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0
+
+	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+
+	mulx	$nj,	$m1,	$lo1	! np[0]*m1
+	umulxhi	$nj,	$m1,	$hi1
+
+	ldx	[$np+8],	$nj	! np[1]
+
+	addcc	$lo0,	$lo1,	$lo1
+	add	$np,	16,	$np
+	addxc	%g0,	$hi1,	$hi1
+
+	mulx	$nj,	$m1,	$nlo	! np[1]*m1
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+
+	ba	.L1st
+	sub	$num,	24,	$cnt	! cnt=num-3
+
+.align	16
+.L1st:
+	addcc	$alo,	$hi0,	$lo0
+	addxc	$aj,	%g0,	$hi0
+
+	ldx	[$ap+0],	$aj	! ap[j]
+	addcc	$nlo,	$hi1,	$lo1
+	add	$ap,	8,	$ap
+	addxc	$nj,	%g0,	$hi1	! nhi=nj
+
+	ldx	[$np+0],	$nj	! np[j]
+	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
+	add	$np,	8,	$np
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+
+	mulx	$nj,	$m1,	$nlo	! np[j]*m1
+	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+	addxc	%g0,	$hi1,	$hi1
+	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
+	add	$tp,	8,	$tp	! tp++
+
+	brnz,pt	$cnt,	.L1st
+	sub	$cnt,	8,	$cnt	! j--
+!.L1st
+	addcc	$alo,	$hi0,	$lo0
+	addxc	$aj,	%g0,	$hi0	! ahi=aj
+
+	addcc	$nlo,	$hi1,	$lo1
+	addxc	$nj,	%g0,	$hi1
+	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
+	addxc	%g0,	$hi1,	$hi1
+	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
+	add	$tp,	8,	$tp
+
+	addcc	$hi0,	$hi1,	$hi1
+	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
+	stxa	$hi1,	[$tp]0xe2
+	add	$tp,	8,	$tp
+
+	ba	.Louter
+	sub	$num,	16,	$i	! i=num-2
+
+.align	16
+.Louter:
+	ldx	[$bp+0],	$m0	! m0=bp[i]
+	add	$bp,	8,	$bp
+
+	sub	$ap,	$num,	$ap	! rewind
+	sub	$np,	$num,	$np
+	sub	$tp,	$num,	$tp
+
+	ldx	[$ap+0],	$aj	! ap[0]
+	ldx	[$np+0],	$nj	! np[0]
+
+	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
+	ldx	[$tp],		$tj	! tp[0]
+	umulxhi	$aj,	$m0,	$hi0
+	ldx	[$ap+8],	$aj	! ap[1]
+	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
+	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
+	addxc	%g0,	$hi0,	$hi0
+	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+	mulx	$nj,	$m1,	$lo1	! np[0]*m1
+	add	$ap,	16,	$ap
+	umulxhi	$nj,	$m1,	$hi1
+	ldx	[$np+8],	$nj	! np[1]
+	add	$np,	16,	$np
+	addcc	$lo1,	$lo0,	$lo1
+	mulx	$nj,	$m1,	$nlo	! np[1]*m1
+	addxc	%g0,	$hi1,	$hi1
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+
+	ba	.Linner
+	sub	$num,	24,	$cnt	! cnt=num-3
+.align	16
+.Linner:
+	addcc	$alo,	$hi0,	$lo0
+	ldx	[$tp+8],	$tj	! tp[j]
+	addxc	$aj,	%g0,	$hi0	! ahi=aj
+	ldx	[$ap+0],	$aj	! ap[j]
+	add	$ap,	8,	$ap
+	addcc	$nlo,	$hi1,	$lo1
+	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
+	addxc	$nj,	%g0,	$hi1	! nhi=nj
+	ldx	[$np+0],	$nj	! np[j]
+	add	$np,	8,	$np
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
+	mulx	$nj,	$m1,	$nlo	! np[j]*m1
+	addxc	%g0,	$hi0,	$hi0
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
+	addxc	%g0,	$hi1,	$hi1
+	stx	$lo1,	[$tp]		! tp[j-1]
+	add	$tp,	8,	$tp
+	brnz,pt	$cnt,	.Linner
+	sub	$cnt,	8,	$cnt
+!.Linner
+	ldx	[$tp+8],	$tj	! tp[j]
+	addcc	$alo,	$hi0,	$lo0
+	addxc	$aj,	%g0,	$hi0	! ahi=aj
+	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
+	addxc	%g0,	$hi0,	$hi0
+
+	addcc	$nlo,	$hi1,	$lo1
+	addxc	$nj,	%g0,	$hi1	! nhi=nj
+	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
+	addxc	%g0,	$hi1,	$hi1
+	stx	$lo1,	[$tp]		! tp[j-1]
+
+	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
+	addxccc	$hi1,	$hi0,	$hi1
+	addxc	%g0,	%g0,	$ovf
+	stx	$hi1,	[$tp+8]
+	add	$tp,	16,	$tp
+
+	brnz,pt	$i,	.Louter
+	sub	$i,	8,	$i
+
+	sub	$ap,	$num,	$ap	! rewind
+	sub	$np,	$num,	$np
+	sub	$tp,	$num,	$tp
+	ba	.Lsub
+	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc
+
+.align	16
+.Lsub:
+	ldx	[$tp],		$tj
+	add	$tp,	8,	$tp
+	ldx	[$np+0],	$nj
+	add	$np,	8,	$np
+	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
+	srlx	$tj,	32,	$tj
+	srlx	$nj,	32,	$nj
+	subccc	$tj,	$nj,	$t3
+	add	$rp,	8,	$rp
+	st	$t2,	[$rp-4]		! reverse order
+	st	$t3,	[$rp-8]
+	brnz,pt	$cnt,	.Lsub
+	sub	$cnt,	8,	$cnt
+
+	sub	$np,	$num,	$np	! rewind
+	sub	$tp,	$num,	$tp
+	sub	$rp,	$num,	$rp
+
+	subc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
+	and	$tp,	$ovf,	$ap
+	andn	$rp,	$ovf,	$np
+	or	$np,	$ap,	$ap	! ap=borrow?tp:rp
+	ba	.Lcopy
+	sub	$num,	8,	$cnt
+
+.align	16
+.Lcopy:					! copy or in-place refresh
+	ldx	[$ap+0],	$t2
+	add	$ap,	8,	$ap
+	stx	%g0,	[$tp]		! zap
+	add	$tp,	8,	$tp
+	stx	$t2,	[$rp+0]
+	add	$rp,	8,	$rp
+	brnz	$cnt,	.Lcopy
+	sub	$cnt,	8,	$cnt
+
+	mov	1,	%o0
+	ret
+	restore
+.type	bn_mul_mont_t4, #function
+.size	bn_mul_mont_t4, .-bn_mul_mont_t4
+___
+
+# int bn_mul_mont_gather5(
+$rp="%o0";	# u64 *rp,
+$ap="%o1";	# const u64 *ap,
+$bp="%o2";	# const u64 *pwrtbl,
+$np="%o3";	# const u64 *np,
+$n0p="%o4";	# const BN_ULONG *n0,
+$num="%o5";	# int num,	# caller ensures that num is >=3
+		# int power);
+$code.=<<___;
+.globl	bn_mul_mont_gather5_t4
+.align	32
+bn_mul_mont_gather5_t4:
+	add	%sp,	STACK_BIAS,	%g4	! real top of stack
+	sll	$num,	3,	$num		! size in bytes
+	add	$num,	63,	%g1
+	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
+	sub	%g4,	%g1,	%g1
+	andn	%g1,	63,	%g1		! align at 64 byte
+	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
+	sub	%g1,	%g4,	%g1
+	LDPTR	[%sp+STACK_7thARG],	%g4	! load power, 7th argument
+
+	save	%sp,	%g1,	%sp
+___
+#	+-------------------------------+<-----	%sp
+#	.				.
+#	+-------------------------------+<-----	aligned at 64 bytes
+#	| __int64 tmp[0]		|
+#	+-------------------------------+
+#	.				.
+#	.				.
+#	+-------------------------------+<-----	aligned at 64 bytes
+#	.				.
+($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
+($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
+($ovf,$i)=($t0,$t1);
+	&load_ccr($bp,"%g4",$ccr);
+	&load_b($bp,$m0,"%o7");		# m0=bp[0]
+
+$code.=<<___;
+	ld	[$n0p+0],	$t0	! pull n0[0..1] value
+	ld	[$n0p+4],	$t1
+	add	%sp, STACK_BIAS+STACK_FRAME, $tp
+	sllx	$t1,	32,	$n0
+	or	$t0,	$n0,	$n0
+
+	ldx	[$ap+0],	$aj	! ap[0]
+
+	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
+	umulxhi	$aj,	$m0,	$hi0
+
+	ldx	[$ap+8],	$aj	! ap[1]
+	add	$ap,	16,	$ap
+	ldx	[$np+0],	$nj	! np[0]
+
+	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0
+
+	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+
+	mulx	$nj,	$m1,	$lo1	! np[0]*m1
+	umulxhi	$nj,	$m1,	$hi1
+
+	ldx	[$np+8],	$nj	! np[1]
+
+	addcc	$lo0,	$lo1,	$lo1
+	add	$np,	16,	$np
+	addxc	%g0,	$hi1,	$hi1
+
+	mulx	$nj,	$m1,	$nlo	! np[1]*m1
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+
+	ba	.L1st_g5
+	sub	$num,	24,	$cnt	! cnt=num-3
+
+.align	16
+.L1st_g5:
+	addcc	$alo,	$hi0,	$lo0
+	addxc	$aj,	%g0,	$hi0
+
+	ldx	[$ap+0],	$aj	! ap[j]
+	addcc	$nlo,	$hi1,	$lo1
+	add	$ap,	8,	$ap
+	addxc	$nj,	%g0,	$hi1	! nhi=nj
+
+	ldx	[$np+0],	$nj	! np[j]
+	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
+	add	$np,	8,	$np
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+
+	mulx	$nj,	$m1,	$nlo	! np[j]*m1
+	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+	addxc	%g0,	$hi1,	$hi1
+	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
+	add	$tp,	8,	$tp	! tp++
+
+	brnz,pt	$cnt,	.L1st_g5
+	sub	$cnt,	8,	$cnt	! j--
+!.L1st_g5
+	addcc	$alo,	$hi0,	$lo0
+	addxc	$aj,	%g0,	$hi0	! ahi=aj
+
+	addcc	$nlo,	$hi1,	$lo1
+	addxc	$nj,	%g0,	$hi1
+	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
+	addxc	%g0,	$hi1,	$hi1
+	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
+	add	$tp,	8,	$tp
+
+	addcc	$hi0,	$hi1,	$hi1
+	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
+	stxa	$hi1,	[$tp]0xe2
+	add	$tp,	8,	$tp
+
+	ba	.Louter_g5
+	sub	$num,	16,	$i	! i=num-2
+
+.align	16
+.Louter_g5:
+	wr	$ccr,	%g0,	%ccr
+___
+	&load_b($bp,$m0);		# m0=bp[i]
+$code.=<<___;
+	sub	$ap,	$num,	$ap	! rewind
+	sub	$np,	$num,	$np
+	sub	$tp,	$num,	$tp
+
+	ldx	[$ap+0],	$aj	! ap[0]
+	ldx	[$np+0],	$nj	! np[0]
+
+	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
+	ldx	[$tp],		$tj	! tp[0]
+	umulxhi	$aj,	$m0,	$hi0
+	ldx	[$ap+8],	$aj	! ap[1]
+	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
+	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
+	addxc	%g0,	$hi0,	$hi0
+	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+	mulx	$nj,	$m1,	$lo1	! np[0]*m1
+	add	$ap,	16,	$ap
+	umulxhi	$nj,	$m1,	$hi1
+	ldx	[$np+8],	$nj	! np[1]
+	add	$np,	16,	$np
+	addcc	$lo1,	$lo0,	$lo1
+	mulx	$nj,	$m1,	$nlo	! np[1]*m1
+	addxc	%g0,	$hi1,	$hi1
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+
+	ba	.Linner_g5
+	sub	$num,	24,	$cnt	! cnt=num-3
+.align	16
+.Linner_g5:
+	addcc	$alo,	$hi0,	$lo0
+	ldx	[$tp+8],	$tj	! tp[j]
+	addxc	$aj,	%g0,	$hi0	! ahi=aj
+	ldx	[$ap+0],	$aj	! ap[j]
+	add	$ap,	8,	$ap
+	addcc	$nlo,	$hi1,	$lo1
+	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
+	addxc	$nj,	%g0,	$hi1	! nhi=nj
+	ldx	[$np+0],	$nj	! np[j]
+	add	$np,	8,	$np
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
+	mulx	$nj,	$m1,	$nlo	! np[j]*m1
+	addxc	%g0,	$hi0,	$hi0
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
+	addxc	%g0,	$hi1,	$hi1
+	stx	$lo1,	[$tp]		! tp[j-1]
+	add	$tp,	8,	$tp
+	brnz,pt	$cnt,	.Linner_g5
+	sub	$cnt,	8,	$cnt
+!.Linner_g5
+	ldx	[$tp+8],	$tj	! tp[j]
+	addcc	$alo,	$hi0,	$lo0
+	addxc	$aj,	%g0,	$hi0	! ahi=aj
+	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
+	addxc	%g0,	$hi0,	$hi0
+
+	addcc	$nlo,	$hi1,	$lo1
+	addxc	$nj,	%g0,	$hi1	! nhi=nj
+	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
+	addxc	%g0,	$hi1,	$hi1
+	stx	$lo1,	[$tp]		! tp[j-1]
+
+	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
+	addxccc	$hi1,	$hi0,	$hi1
+	addxc	%g0,	%g0,	$ovf
+	stx	$hi1,	[$tp+8]
+	add	$tp,	16,	$tp
+
+	brnz,pt	$i,	.Louter_g5
+	sub	$i,	8,	$i
+
+	sub	$ap,	$num,	$ap	! rewind
+	sub	$np,	$num,	$np
+	sub	$tp,	$num,	$tp
+	ba	.Lsub_g5
+	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc
+
+.align	16
+.Lsub_g5:
+	ldx	[$tp],		$tj
+	add	$tp,	8,	$tp
+	ldx	[$np+0],	$nj
+	add	$np,	8,	$np
+	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
+	srlx	$tj,	32,	$tj
+	srlx	$nj,	32,	$nj
+	subccc	$tj,	$nj,	$t3
+	add	$rp,	8,	$rp
+	st	$t2,	[$rp-4]		! reverse order
+	st	$t3,	[$rp-8]
+	brnz,pt	$cnt,	.Lsub_g5
+	sub	$cnt,	8,	$cnt
+
+	sub	$np,	$num,	$np	! rewind
+	sub	$tp,	$num,	$tp
+	sub	$rp,	$num,	$rp
+
+	subc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
+	and	$tp,	$ovf,	$ap
+	andn	$rp,	$ovf,	$np
+	or	$np,	$ap,	$ap	! ap=borrow?tp:rp
+	ba	.Lcopy_g5
+	sub	$num,	8,	$cnt
+
+.align	16
+.Lcopy_g5:				! copy or in-place refresh
+	ldx	[$ap+0],	$t2
+	add	$ap,	8,	$ap
+	stx	%g0,	[$tp]		! zap
+	add	$tp,	8,	$tp
+	stx	$t2,	[$rp+0]
+	add	$rp,	8,	$rp
+	brnz	$cnt,	.Lcopy_g5
+	sub	$cnt,	8,	$cnt
+
+	mov	1,	%o0
+	ret
+	restore
+.type	bn_mul_mont_gather5_t4, #function
+.size	bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
+___
+}
+
+$code.=<<___;
+.globl	bn_flip_t4
+.align	32
+bn_flip_t4:
+.Loop_flip:
+	ld	[%o1+0],	%o4
+	sub	%o2,	1,	%o2
+	ld	[%o1+4],	%o5
+	add	%o1,	8,	%o1
+	st	%o5,	[%o0+0]
+	st	%o4,	[%o0+4]
+	brnz	%o2,	.Loop_flip
+	add	%o0,	8,	%o0
+	retl
+	nop
+.type	bn_flip_t4, #function
+.size	bn_flip_t4, .-bn_flip_t4
+
+.globl	bn_flip_n_scatter5_t4
+.align	32
+bn_flip_n_scatter5_t4:
+	sll	%o3,	3,	%o3
+	srl	%o1,	1,	%o1
+	add	%o3,	%o2,	%o2	! &pwrtbl[pwr]
+	sub	%o1,	1,	%o1
+.Loop_flip_n_scatter5:
+	ld	[%o0+0],	%o4	! inp[i]
+	ld	[%o0+4],	%o5
+	add	%o0,	8,	%o0
+	sllx	%o5,	32,	%o5
+	or	%o4,	%o5,	%o5
+	stx	%o5,	[%o2]
+	add	%o2,	32*8,	%o2
+	brnz	%o1,	.Loop_flip_n_scatter5
+	sub	%o1,	1,	%o1
+	retl
+	nop
+.type	bn_flip_n_scatter5_t4, #function
+.size	bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
+
+.globl	bn_gather5_t4
+.align	32
+bn_gather5_t4:
+___
+	&load_ccr("%o2","%o3","%g1");
+$code.=<<___;
+	sub	%o1,	1,	%o1
+.Loop_gather5:
+___
+	&load_b("%o2","%g1");
+$code.=<<___;
+	stx	%g1,	[%o0]
+	add	%o0,	8,	%o0
+	brnz	%o1,	.Loop_gather5
+	sub	%o1,	1,	%o1
+
+	retl
+	nop
+.type	bn_gather5_t4, #function
+.size	bn_gather5_t4, .-bn_gather5_t4
+
+.asciz	"Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
+.align	4
+___
+
+&emit_assembler();
+
+close STDOUT;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/sparcv9-gf2m.pl	Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,191 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# October 2012
+#
+# The module implements the bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. For the time being it is a fairly mechanical port from C,
+# except that it has two code paths: one suitable for all SPARCv9 processors
+# and one for VIS3-capable ones. The former delivers ~25-45% more than
+# compiler-generated code, more for longer keys, on the heaviest DH and DSA
+# verify operations on the venerable UltraSPARC II. On T4 the VIS3 code is
+# ~100-230% faster than gcc-generated code and ~35-90% faster than
+# the pure SPARCv9 code path.
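+#
+# Rough shape of the software path (C-like pseudocode, for orientation only;
+# the real code below additionally masks `a' so the shifted copies stay
+# within 64 bits and handles the top bits of `a' separately):
+#
+#	/* tab[i] = GF(2) product of a and the 4-bit polynomial i */
+#	for (n = 0; n < 64; n += 4) {
+#		t   = tab[(b >> n) & 0xf];
+#		lo ^= t << n;
+#		hi ^= n ? t >> (64 - n) : 0;
+#	}
+#
+# i.e. a classic 4-bit-window carry-less multiplication.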
+
+$locals=16*8;
+
+$tab="%l0";
+
+@T=("%g2","%g3");
+@i=("%g4","%g5");
+
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%o$_",(0..5));
+($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo;
+
+$code.=<<___;
+#include <sparc_arch.h>
+#include <openssl/fipssyms.h>
+
+#ifdef __arch64__
+.register	%g2,#scratch
+.register	%g3,#scratch
+#endif
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+
+.globl	bn_GF2m_mul_2x2
+.align	16
+bn_GF2m_mul_2x2:
+        SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+        ld	[%g1+0],%g1             	! OPENSSL_sparcv9cap_P[0]
+
+        andcc	%g1, SPARCV9_VIS3, %g0
+        bz,pn	%icc,.Lsoftware
+        nop
+
+	sllx	%o1, 32, %o1
+	sllx	%o3, 32, %o3
+	or	%o2, %o1, %o1
+	or	%o4, %o3, %o3
+	.word	0x95b262ab			! xmulx   %o1, %o3, %o2
+	.word	0x99b262cb			! xmulxhi %o1, %o3, %o4
+	srlx	%o2, 32, %o1			! 13 cycles later
+	st	%o2, [%o0+0]
+	st	%o1, [%o0+4]
+	srlx	%o4, 32, %o3
+	st	%o4, [%o0+8]
+	retl
+	st	%o3, [%o0+12]
+
+.align	16
+.Lsoftware:
+	save	%sp,-STACK_FRAME-$locals,%sp
+
+	sllx	%i1,32,$a
+	mov	-1,$a12
+	sllx	%i3,32,$b
+	or	%i2,$a,$a
+	srlx	$a12,1,$a48			! 0x7fff...
+	or	%i4,$b,$b
+	srlx	$a12,2,$a12			! 0x3fff...
+	add	%sp,STACK_BIAS+STACK_FRAME,$tab
+
+	sllx	$a,2,$a4
+	mov	$a,$a1
+	sllx	$a,1,$a2
+
+	srax	$a4,63,@i[1]			! broadcast 61st bit
+	and	$a48,$a4,$a4			! (a<<2)&0x7fff...
+	srlx	$a48,2,$a48
+	srax	$a2,63,@i[0]			! broadcast 62nd bit
+	and	$a12,$a2,$a2			! (a<<1)&0x3fff...
+	srax	$a1,63,$lo			! broadcast 63rd bit
+	and	$a48,$a1,$a1			! (a<<0)&0x1fff...
+
+	sllx	$a1,3,$a8
+	and	$b,$lo,$lo
+	and	$b,@i[0],@i[0]
+	and	$b,@i[1],@i[1]
+
+	stx	%g0,[$tab+0*8]			! tab[0]=0
+	xor	$a1,$a2,$a12
+	stx	$a1,[$tab+1*8]			! tab[1]=a1
+	stx	$a2,[$tab+2*8]			! tab[2]=a2
+	 xor	$a4,$a8,$a48
+	stx	$a12,[$tab+3*8]			! tab[3]=a1^a2
+	 xor	$a4,$a1,$a1
+
+	stx	$a4,[$tab+4*8]			! tab[4]=a4
+	xor	$a4,$a2,$a2
+	stx	$a1,[$tab+5*8]			! tab[5]=a1^a4
+	xor	$a4,$a12,$a12
+	stx	$a2,[$tab+6*8]			! tab[6]=a2^a4
+	 xor	$a48,$a1,$a1
+	stx	$a12,[$tab+7*8]			! tab[7]=a1^a2^a4
+	 xor	$a48,$a2,$a2
+
+	stx	$a8,[$tab+8*8]			! tab[8]=a8
+	xor	$a48,$a12,$a12
+	stx	$a1,[$tab+9*8]			! tab[9]=a1^a8
+	 xor	$a4,$a1,$a1
+	stx	$a2,[$tab+10*8]			! tab[10]=a2^a8
+	 xor	$a4,$a2,$a2
+	stx	$a12,[$tab+11*8]		! tab[11]=a1^a2^a8
+
+	xor	$a4,$a12,$a12
+	stx	$a48,[$tab+12*8]		! tab[12]=a4^a8
+	 srlx	$lo,1,$hi
+	stx	$a1,[$tab+13*8]			! tab[13]=a1^a4^a8
+	 sllx	$lo,63,$lo
+	stx	$a2,[$tab+14*8]			! tab[14]=a2^a4^a8
+	 srlx	@i[0],2,@T[0]
+	stx	$a12,[$tab+15*8]		! tab[15]=a1^a2^a4^a8
+
+	sllx	@i[0],62,$a1
+	 sllx	$b,3,@i[0]
+	srlx	@i[1],3,@T[1]
+	 and	@i[0],`0xf<<3`,@i[0]
+	sllx	@i[1],61,$a2
+	 ldx	[$tab+@i[0]],@i[0]
+	 srlx	$b,4-3,@i[1]
+	xor	@T[0],$hi,$hi
+	 and	@i[1],`0xf<<3`,@i[1]
+	xor	$a1,$lo,$lo
+	 ldx	[$tab+@i[1]],@i[1]
+	xor	@T[1],$hi,$hi
+
+	xor	@i[0],$lo,$lo
+	srlx	$b,8-3,@i[0]
+	 xor	$a2,$lo,$lo
+	and	@i[0],`0xf<<3`,@i[0]
+___
+for($n=1;$n<14;$n++) {
+$code.=<<___;
+	sllx	@i[1],`$n*4`,@T[0]
+	ldx	[$tab+@i[0]],@i[0]
+	srlx	@i[1],`64-$n*4`,@T[1]
+	xor	@T[0],$lo,$lo
+	srlx	$b,`($n+2)*4`-3,@i[1]
+	xor	@T[1],$hi,$hi
+	and	@i[1],`0xf<<3`,@i[1]
+___
+	push(@i,shift(@i)); push(@T,shift(@T));
+}
+$code.=<<___;
+	sllx	@i[1],`$n*4`,@T[0]
+	ldx	[$tab+@i[0]],@i[0]
+	srlx	@i[1],`64-$n*4`,@T[1]
+	xor	@T[0],$lo,$lo
+
+	sllx	@i[0],`($n+1)*4`,@T[0]
+	 xor	@T[1],$hi,$hi
+	srlx	@i[0],`64-($n+1)*4`,@T[1]
+	xor	@T[0],$lo,$lo
+	xor	@T[1],$hi,$hi
+
+	srlx	$lo,32,%i1
+	st	$lo,[%i0+0]
+	st	%i1,[%i0+4]
+	srlx	$hi,32,%i2
+	st	$hi,[%i0+8]
+	st	%i2,[%i0+12]
+
+	ret
+	restore
+.type	bn_GF2m_mul_2x2,#function
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.asciz	"GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align	4
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/sparcv9_modes.pl	Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,1691 @@
+#!/usr/bin/env perl
+
+# Specific modes implementations for SPARC Architecture 2011. There is a T4
+# dependency though: an ASI value that is not specified in the Architecture
+# Manual. But as the SPARC universe is rather monocultural, we assume that a
+# processor capable of executing the crypto instructions can handle the ASI
+# in question as well. This means we ought to keep our eyes open when new
+# processors emerge...
+#
+# As for the above-mentioned ASI: it is the so-called "block initializing
+# store", which cancels the "read" in "read-update-write" on cache lines.
+# This is a "cooperative" optimization, as it reduces overall pressure on
+# the memory interface. The benefit cannot be observed or quantified with
+# the usual benchmarks; on the contrary, you may notice that single-thread
+# performance for parallelizable modes is ~1.5% worse for the largest block
+# sizes [though a few percent better for not-so-long ones]. All this is
+# based on suggestions from David Miller.
+
+sub asm_init {		# to be called with @ARGV as argument
+    for (@_)		{ $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
+    if ($::abibits==64)	{ $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
+    else		{ $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
+}
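+# Typical usage (an assumption about the *t4-sparcv9.pl scripts that require
+# this file, not something this file enforces):
+#
+#	&asm_init(@ARGV);
+#	&alg_cbc_encrypt_implement("aes", 128);	# emits aes128_t4_cbc_encrypt
+#
+# after which the caller outputs the accumulated $::code.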
+
+# unified interface
+my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
+# local variables
+my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
+
+sub alg_cbc_encrypt_implement {
+my ($alg,$bits) = @_;
+
+$::code.=<<___;
+.globl	${alg}${bits}_t4_cbc_encrypt
+.align	32
+${alg}${bits}_t4_cbc_encrypt:
+	save		%sp, -$::frame, %sp
+	cmp		$len, 0
+	be,pn		$::size_t_cc, .L${bits}_cbc_enc_abort
+	srln		$len, 0, $len		! needed on v8+, "nop" on v9
+	sub		$inp, $out, $blk_init	! $inp!=$out
+___
+$::code.=<<___ if (!$::evp);
+	andcc		$ivec, 7, $ivoff
+	alignaddr	$ivec, %g0, $ivec
+
+	ldd		[$ivec + 0], %f0	! load ivec
+	bz,pt		%icc, 1f
+	ldd		[$ivec + 8], %f2
+	ldd		[$ivec + 16], %f4
+	faligndata	%f0, %f2, %f0
+	faligndata	%f2, %f4, %f2
+1:
+___
+$::code.=<<___ if ($::evp);
+	ld		[$ivec + 0], %f0
+	ld		[$ivec + 4], %f1
+	ld		[$ivec + 8], %f2
+	ld		[$ivec + 12], %f3
+___
+$::code.=<<___;
+	prefetch	[$inp], 20
+	prefetch	[$inp + 63], 20
+	call		_${alg}${bits}_load_enckey
+	and		$inp, 7, $ileft
+	andn		$inp, 7, $inp
+	sll		$ileft, 3, $ileft
+	mov		64, $iright
+	mov		0xff, $omask
+	sub		$iright, $ileft, $iright
+	and		$out, 7, $ooff
+	cmp		$len, 127
+	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
+	movleu		$::size_t_cc, 0, $blk_init	!	$len<128 ||
+	brnz,pn		$blk_init, .L${bits}cbc_enc_blk	!	$inp==$out)
+	srl		$omask, $ooff, $omask
+
+	alignaddrl	$out, %g0, $out
+	srlx		$len, 4, $len
+	prefetch	[$out], 22
+
+.L${bits}_cbc_enc_loop:
+	ldx		[$inp + 0], %o0
+	brz,pt		$ileft, 4f
+	ldx		[$inp + 8], %o1
+
+	ldx		[$inp + 16], %o2
+	sllx		%o0, $ileft, %o0
+	srlx		%o1, $iright, %g1
+	sllx		%o1, $ileft, %o1
+	or		%g1, %o0, %o0
+	srlx		%o2, $iright, %o2
+	or		%o2, %o1, %o1
+4:
+	xor		%g4, %o0, %o0		! ^= rk[0]
+	xor		%g5, %o1, %o1
+	movxtod		%o0, %f12
+	movxtod		%o1, %f14
+
+	fxor		%f12, %f0, %f0		! ^= ivec
+	fxor		%f14, %f2, %f2
+	prefetch	[$out + 63], 22
+	prefetch	[$inp + 16+63], 20
+	call		_${alg}${bits}_encrypt_1x
+	add		$inp, 16, $inp
+
+	brnz,pn		$ooff, 2f
+	sub		$len, 1, $len
+		
+	std		%f0, [$out + 0]
+	std		%f2, [$out + 8]
+	brnz,pt		$len, .L${bits}_cbc_enc_loop
+	add		$out, 16, $out
+___
+$::code.=<<___ if ($::evp);
+	st		%f0, [$ivec + 0]
+	st		%f1, [$ivec + 4]
+	st		%f2, [$ivec + 8]
+	st		%f3, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+	brnz,pn		$ivoff, 3f
+	nop
+
+	std		%f0, [$ivec + 0]	! write out ivec
+	std		%f2, [$ivec + 8]
+___
+$::code.=<<___;
+.L${bits}_cbc_enc_abort:
+	ret
+	restore
+
+.align	16
+2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
+						! and ~3x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f4		! handle unaligned output
+	faligndata	%f0, %f2, %f6
+	faligndata	%f2, %f2, %f8
+
+	stda		%f4, [$out + $omask]0xc0	! partial store
+	std		%f6, [$out + 8]
+	add		$out, 16, $out
+	orn		%g0, $omask, $omask
+	stda		%f8, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .L${bits}_cbc_enc_loop+4
+	orn		%g0, $omask, $omask
+___
+$::code.=<<___ if ($::evp);
+	st		%f0, [$ivec + 0]
+	st		%f1, [$ivec + 4]
+	st		%f2, [$ivec + 8]
+	st		%f3, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+	brnz,pn		$ivoff, 3f
+	nop
+
+	std		%f0, [$ivec + 0]	! write out ivec
+	std		%f2, [$ivec + 8]
+	ret
+	restore
+
+.align	16
+3:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
+	mov		0xff, $omask
+	srl		$omask, $ivoff, $omask
+	faligndata	%f0, %f0, %f4
+	faligndata	%f0, %f2, %f6
+	faligndata	%f2, %f2, %f8
+	stda		%f4, [$ivec + $omask]0xc0
+	std		%f6, [$ivec + 8]
+	add		$ivec, 16, $ivec
+	orn		%g0, $omask, $omask
+	stda		%f8, [$ivec + $omask]0xc0
+___
+$::code.=<<___;
+	ret
+	restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align	32
+.L${bits}cbc_enc_blk:
+	add	$out, $len, $blk_init
+	and	$blk_init, 63, $blk_init	! tail
+	sub	$len, $blk_init, $len
+	add	$blk_init, 15, $blk_init	! round up to 16n
+	srlx	$len, 4, $len
+	srl	$blk_init, 4, $blk_init
+
+.L${bits}_cbc_enc_blk_loop:
+	ldx		[$inp + 0], %o0
+	brz,pt		$ileft, 5f
+	ldx		[$inp + 8], %o1
+
+	ldx		[$inp + 16], %o2
+	sllx		%o0, $ileft, %o0
+	srlx		%o1, $iright, %g1
+	sllx		%o1, $ileft, %o1
+	or		%g1, %o0, %o0
+	srlx		%o2, $iright, %o2
+	or		%o2, %o1, %o1
+5:
+	xor		%g4, %o0, %o0		! ^= rk[0]
+	xor		%g5, %o1, %o1
+	movxtod		%o0, %f12
+	movxtod		%o1, %f14
+
+	fxor		%f12, %f0, %f0		! ^= ivec
+	fxor		%f14, %f2, %f2
+	prefetch	[$inp + 16+63], 20
+	call		_${alg}${bits}_encrypt_1x
+	add		$inp, 16, $inp
+	sub		$len, 1, $len
+		
+	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	add		$out, 8, $out
+	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	brnz,pt		$len, .L${bits}_cbc_enc_blk_loop
+	add		$out, 8, $out
+
+	membar		#StoreLoad|#StoreStore
+	brnz,pt		$blk_init, .L${bits}_cbc_enc_loop
+	mov		$blk_init, $len
+___
+$::code.=<<___ if ($::evp);
+	st		%f0, [$ivec + 0]
+	st		%f1, [$ivec + 4]
+	st		%f2, [$ivec + 8]
+	st		%f3, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+	brnz,pn		$ivoff, 3b
+	nop
+
+	std		%f0, [$ivec + 0]	! write out ivec
+	std		%f2, [$ivec + 8]
+___
+$::code.=<<___;
+	ret
+	restore
+.type	${alg}${bits}_t4_cbc_encrypt,#function
+.size	${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
+___
+}
+
+sub alg_cbc_decrypt_implement {
+my ($alg,$bits) = @_;
+
+$::code.=<<___;
+.globl	${alg}${bits}_t4_cbc_decrypt
+.align	32
+${alg}${bits}_t4_cbc_decrypt:
+	save		%sp, -$::frame, %sp
+	cmp		$len, 0
+	be,pn		$::size_t_cc, .L${bits}_cbc_dec_abort
+	srln		$len, 0, $len		! needed on v8+, "nop" on v9
+	sub		$inp, $out, $blk_init	! $inp!=$out
+___
+$::code.=<<___ if (!$::evp);
+	andcc		$ivec, 7, $ivoff
+	alignaddr	$ivec, %g0, $ivec
+
+	ldd		[$ivec + 0], %f12	! load ivec
+	bz,pt		%icc, 1f
+	ldd		[$ivec + 8], %f14
+	ldd		[$ivec + 16], %f0
+	faligndata	%f12, %f14, %f12
+	faligndata	%f14, %f0, %f14
+1:
+___
+$::code.=<<___ if ($::evp);
+	ld		[$ivec + 0], %f12	! load ivec
+	ld		[$ivec + 4], %f13
+	ld		[$ivec + 8], %f14
+	ld		[$ivec + 12], %f15
+___
+$::code.=<<___;
+	prefetch	[$inp], 20
+	prefetch	[$inp + 63], 20
+	call		_${alg}${bits}_load_deckey
+	and		$inp, 7, $ileft
+	andn		$inp, 7, $inp
+	sll		$ileft, 3, $ileft
+	mov		64, $iright
+	mov		0xff, $omask
+	sub		$iright, $ileft, $iright
+	and		$out, 7, $ooff
+	cmp		$len, 255
+	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
+	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
+	brnz,pn		$blk_init, .L${bits}cbc_dec_blk	!	$inp==$out)
+	srl		$omask, $ooff, $omask
+
+	andcc		$len, 16, %g0		! is number of blocks even?
+	srlx		$len, 4, $len
+	alignaddrl	$out, %g0, $out
+	bz		%icc, .L${bits}_cbc_dec_loop2x
+	prefetch	[$out], 22
+.L${bits}_cbc_dec_loop:
+	ldx		[$inp + 0], %o0
+	brz,pt		$ileft, 4f
+	ldx		[$inp + 8], %o1
+
+	ldx		[$inp + 16], %o2
+	sllx		%o0, $ileft, %o0
+	srlx		%o1, $iright, %g1
+	sllx		%o1, $ileft, %o1
+	or		%g1, %o0, %o0
+	srlx		%o2, $iright, %o2
+	or		%o2, %o1, %o1
+4:
+	xor		%g4, %o0, %o2		! ^= rk[0]
+	xor		%g5, %o1, %o3
+	movxtod		%o2, %f0
+	movxtod		%o3, %f2
+
+	prefetch	[$out + 63], 22
+	prefetch	[$inp + 16+63], 20
+	call		_${alg}${bits}_decrypt_1x
+	add		$inp, 16, $inp
+
+	fxor		%f12, %f0, %f0		! ^= ivec
+	fxor		%f14, %f2, %f2
+	movxtod		%o0, %f12
+	movxtod		%o1, %f14
+
+	brnz,pn		$ooff, 2f
+	sub		$len, 1, $len
+		
+	std		%f0, [$out + 0]
+	std		%f2, [$out + 8]
+	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
+	add		$out, 16, $out
+___
+$::code.=<<___ if ($::evp);
+	st		%f12, [$ivec + 0]
+	st		%f13, [$ivec + 4]
+	st		%f14, [$ivec + 8]
+	st		%f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
+	nop
+
+	std		%f12, [$ivec + 0]	! write out ivec
+	std		%f14, [$ivec + 8]
+___
+$::code.=<<___;
+.L${bits}_cbc_dec_abort:
+	ret
+	restore
+
+.align	16
+2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
+						! and ~3x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f4		! handle unaligned output
+	faligndata	%f0, %f2, %f6
+	faligndata	%f2, %f2, %f8
+
+	stda		%f4, [$out + $omask]0xc0	! partial store
+	std		%f6, [$out + 8]
+	add		$out, 16, $out
+	orn		%g0, $omask, $omask
+	stda		%f8, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
+	orn		%g0, $omask, $omask
+___
+$::code.=<<___ if ($::evp);
+	st		%f12, [$ivec + 0]
+	st		%f13, [$ivec + 4]
+	st		%f14, [$ivec + 8]
+	st		%f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
+	nop
+
+	std		%f12, [$ivec + 0]	! write out ivec
+	std		%f14, [$ivec + 8]
+___
+$::code.=<<___;
+	ret
+	restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align	32
+.L${bits}_cbc_dec_loop2x:
+	ldx		[$inp + 0], %o0
+	ldx		[$inp + 8], %o1
+	ldx		[$inp + 16], %o2
+	brz,pt		$ileft, 4f
+	ldx		[$inp + 24], %o3
+
+	ldx		[$inp + 32], %o4
+	sllx		%o0, $ileft, %o0
+	srlx		%o1, $iright, %g1
+	or		%g1, %o0, %o0
+	sllx		%o1, $ileft, %o1
+	srlx		%o2, $iright, %g1
+	or		%g1, %o1, %o1
+	sllx		%o2, $ileft, %o2
+	srlx		%o3, $iright, %g1
+	or		%g1, %o2, %o2
+	sllx		%o3, $ileft, %o3
+	srlx		%o4, $iright, %o4
+	or		%o4, %o3, %o3
+4:
+	xor		%g4, %o0, %o4		! ^= rk[0]
+	xor		%g5, %o1, %o5
+	movxtod		%o4, %f0
+	movxtod		%o5, %f2
+	xor		%g4, %o2, %o4
+	xor		%g5, %o3, %o5
+	movxtod		%o4, %f4
+	movxtod		%o5, %f6
+
+	prefetch	[$out + 63], 22
+	prefetch	[$inp + 32+63], 20
+	call		_${alg}${bits}_decrypt_2x
+	add		$inp, 32, $inp
+
+	movxtod		%o0, %f8
+	movxtod		%o1, %f10
+	fxor		%f12, %f0, %f0		! ^= ivec
+	fxor		%f14, %f2, %f2
+	movxtod		%o2, %f12
+	movxtod		%o3, %f14
+	fxor		%f8, %f4, %f4
+	fxor		%f10, %f6, %f6
+
+	brnz,pn		$ooff, 2f
+	sub		$len, 2, $len
+		
+	std		%f0, [$out + 0]
+	std		%f2, [$out + 8]
+	std		%f4, [$out + 16]
+	std		%f6, [$out + 24]
+	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
+	add		$out, 32, $out
+___
+$::code.=<<___ if ($::evp);
+	st		%f12, [$ivec + 0]
+	st		%f13, [$ivec + 4]
+	st		%f14, [$ivec + 8]
+	st		%f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
+	nop
+
+	std		%f12, [$ivec + 0]	! write out ivec
+	std		%f14, [$ivec + 8]
+___
+$::code.=<<___;
+	ret
+	restore
+
+.align	16
+2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
+						! and ~3x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f8		! handle unaligned output
+	faligndata	%f0, %f2, %f0
+	faligndata	%f2, %f4, %f2
+	faligndata	%f4, %f6, %f4
+	faligndata	%f6, %f6, %f6
+	stda		%f8, [$out + $omask]0xc0	! partial store
+	std		%f0, [$out + 8]
+	std		%f2, [$out + 16]
+	std		%f4, [$out + 24]
+	add		$out, 32, $out
+	orn		%g0, $omask, $omask
+	stda		%f6, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
+	orn		%g0, $omask, $omask
+___
+$::code.=<<___ if ($::evp);
+	st		%f12, [$ivec + 0]
+	st		%f13, [$ivec + 4]
+	st		%f14, [$ivec + 8]
+	st		%f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
+	nop
+
+	std		%f12, [$ivec + 0]	! write out ivec
+	std		%f14, [$ivec + 8]
+	ret
+	restore
+
+.align	16
+.L${bits}_cbc_dec_unaligned_ivec:
+	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
+	mov		0xff, $omask
+	srl		$omask, $ivoff, $omask
+	faligndata	%f12, %f12, %f0
+	faligndata	%f12, %f14, %f2
+	faligndata	%f14, %f14, %f4
+	stda		%f0, [$ivec + $omask]0xc0
+	std		%f2, [$ivec + 8]
+	add		$ivec, 16, $ivec
+	orn		%g0, $omask, $omask
+	stda		%f4, [$ivec + $omask]0xc0
+___
+$::code.=<<___;
+	ret
+	restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align	32
+.L${bits}cbc_dec_blk:
+	add	$out, $len, $blk_init
+	and	$blk_init, 63, $blk_init	! tail
+	sub	$len, $blk_init, $len
+	add	$blk_init, 15, $blk_init	! round up to 16n
+	srlx	$len, 4, $len
+	srl	$blk_init, 4, $blk_init
+	sub	$len, 1, $len
+	add	$blk_init, 1, $blk_init
+
+.L${bits}_cbc_dec_blk_loop2x:
+	ldx		[$inp + 0], %o0
+	ldx		[$inp + 8], %o1
+	ldx		[$inp + 16], %o2
+	brz,pt		$ileft, 5f
+	ldx		[$inp + 24], %o3
+
+	ldx		[$inp + 32], %o4
+	sllx		%o0, $ileft, %o0
+	srlx		%o1, $iright, %g1
+	or		%g1, %o0, %o0
+	sllx		%o1, $ileft, %o1
+	srlx		%o2, $iright, %g1
+	or		%g1, %o1, %o1
+	sllx		%o2, $ileft, %o2
+	srlx		%o3, $iright, %g1
+	or		%g1, %o2, %o2
+	sllx		%o3, $ileft, %o3
+	srlx		%o4, $iright, %o4
+	or		%o4, %o3, %o3
+5:
+	xor		%g4, %o0, %o4		! ^= rk[0]
+	xor		%g5, %o1, %o5
+	movxtod		%o4, %f0
+	movxtod		%o5, %f2
+	xor		%g4, %o2, %o4
+	xor		%g5, %o3, %o5
+	movxtod		%o4, %f4
+	movxtod		%o5, %f6
+
+	prefetch	[$inp + 32+63], 20
+	call		_${alg}${bits}_decrypt_2x
+	add		$inp, 32, $inp
+	subcc		$len, 2, $len
+
+	movxtod		%o0, %f8
+	movxtod		%o1, %f10
+	fxor		%f12, %f0, %f0		! ^= ivec
+	fxor		%f14, %f2, %f2
+	movxtod		%o2, %f12
+	movxtod		%o3, %f14
+	fxor		%f8, %f4, %f4
+	fxor		%f10, %f6, %f6
+
+	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	add		$out, 8, $out
+	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	add		$out, 8, $out
+	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	add		$out, 8, $out
+	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	bgu,pt		$::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
+	add		$out, 8, $out
+
+	add		$blk_init, $len, $len
+	andcc		$len, 1, %g0		! is number of blocks even?
+	membar		#StoreLoad|#StoreStore
+	bnz,pt		%icc, .L${bits}_cbc_dec_loop
+	srl		$len, 0, $len
+	brnz,pn		$len, .L${bits}_cbc_dec_loop2x
+	nop
+___
+$::code.=<<___ if ($::evp);
+	st		%f12, [$ivec + 0]	! write out ivec
+	st		%f13, [$ivec + 4]
+	st		%f14, [$ivec + 8]
+	st		%f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+	brnz,pn		$ivoff, 3b
+	nop
+
+	std		%f12, [$ivec + 0]	! write out ivec
+	std		%f14, [$ivec + 8]
+___
+$::code.=<<___;
+	ret
+	restore
+.type	${alg}${bits}_t4_cbc_decrypt,#function
+.size	${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
+___
+}
+
+sub alg_ctr32_implement {
+my ($alg,$bits) = @_;
+
+$::code.=<<___;
+.globl	${alg}${bits}_t4_ctr32_encrypt
+.align	32
+${alg}${bits}_t4_ctr32_encrypt:
+	save		%sp, -$::frame, %sp
+	srln		$len, 0, $len		! needed on v8+, "nop" on v9
+
+	prefetch	[$inp], 20
+	prefetch	[$inp + 63], 20
+	call		_${alg}${bits}_load_enckey
+	sllx		$len, 4, $len
+
+	ld		[$ivec + 0], %l4	! counter
+	ld		[$ivec + 4], %l5
+	ld		[$ivec + 8], %l6
+	ld		[$ivec + 12], %l7
+
+	sllx		%l4, 32, %o5
+	or		%l5, %o5, %o5
+	sllx		%l6, 32, %g1
+	xor		%o5, %g4, %g4		! ^= rk[0]
+	xor		%g1, %g5, %g5
+	movxtod		%g4, %f14		! most significant 64 bits
+
+	sub		$inp, $out, $blk_init	! $inp!=$out
+	and		$inp, 7, $ileft
+	andn		$inp, 7, $inp
+	sll		$ileft, 3, $ileft
+	mov		64, $iright
+	mov		0xff, $omask
+	sub		$iright, $ileft, $iright
+	and		$out, 7, $ooff
+	cmp		$len, 255
+	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
+	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
+	brnz,pn		$blk_init, .L${bits}_ctr32_blk	!	$inp==$out)
+	srl		$omask, $ooff, $omask
+
+	andcc		$len, 16, %g0		! is number of blocks even?
+	alignaddrl	$out, %g0, $out
+	bz		%icc, .L${bits}_ctr32_loop2x
+	srlx		$len, 4, $len
+.L${bits}_ctr32_loop:
+	ldx		[$inp + 0], %o0
+	brz,pt		$ileft, 4f
+	ldx		[$inp + 8], %o1
+
+	ldx		[$inp + 16], %o2
+	sllx		%o0, $ileft, %o0
+	srlx		%o1, $iright, %g1
+	sllx		%o1, $ileft, %o1
+	or		%g1, %o0, %o0
+	srlx		%o2, $iright, %o2
+	or		%o2, %o1, %o1
+4:
+	xor		%g5, %l7, %g1		! ^= rk[0]
+	add		%l7, 1, %l7
+	movxtod		%g1, %f2
+	srl		%l7, 0, %l7		! clruw
+	prefetch	[$out + 63], 22
+	prefetch	[$inp + 16+63], 20
+___
+$::code.=<<___ if ($alg eq "aes");
+	aes_eround01	%f16, %f14, %f2, %f4
+	aes_eround23	%f18, %f14, %f2, %f2
+___
+$::code.=<<___ if ($alg eq "cmll");
+	camellia_f	%f16, %f2, %f14, %f2
+	camellia_f	%f18, %f14, %f2, %f0
+___
+$::code.=<<___;
+	call		_${alg}${bits}_encrypt_1x+8
+	add		$inp, 16, $inp
+
+	movxtod		%o0, %f10
+	movxtod		%o1, %f12
+	fxor		%f10, %f0, %f0		! ^= inp
+	fxor		%f12, %f2, %f2
+
+	brnz,pn		$ooff, 2f
+	sub		$len, 1, $len
+		
+	std		%f0, [$out + 0]
+	std		%f2, [$out + 8]
+	brnz,pt		$len, .L${bits}_ctr32_loop2x
+	add		$out, 16, $out
+
+	ret
+	restore
+
+.align	16
+2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
+						! and ~3x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f4		! handle unaligned output
+	faligndata	%f0, %f2, %f6
+	faligndata	%f2, %f2, %f8
+	stda		%f4, [$out + $omask]0xc0	! partial store
+	std		%f6, [$out + 8]
+	add		$out, 16, $out
+	orn		%g0, $omask, $omask
+	stda		%f8, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
+	orn		%g0, $omask, $omask
+
+	ret
+	restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align	32
+.L${bits}_ctr32_loop2x:
+	ldx		[$inp + 0], %o0
+	ldx		[$inp + 8], %o1
+	ldx		[$inp + 16], %o2
+	brz,pt		$ileft, 4f
+	ldx		[$inp + 24], %o3
+
+	ldx		[$inp + 32], %o4
+	sllx		%o0, $ileft, %o0
+	srlx		%o1, $iright, %g1
+	or		%g1, %o0, %o0
+	sllx		%o1, $ileft, %o1
+	srlx		%o2, $iright, %g1
+	or		%g1, %o1, %o1
+	sllx		%o2, $ileft, %o2
+	srlx		%o3, $iright, %g1
+	or		%g1, %o2, %o2
+	sllx		%o3, $ileft, %o3
+	srlx		%o4, $iright, %o4
+	or		%o4, %o3, %o3
+4:
+	xor		%g5, %l7, %g1		! ^= rk[0]
+	add		%l7, 1, %l7
+	movxtod		%g1, %f2
+	srl		%l7, 0, %l7		! clruw
+	xor		%g5, %l7, %g1
+	add		%l7, 1, %l7
+	movxtod		%g1, %f6
+	srl		%l7, 0, %l7		! clruw
+	prefetch	[$out + 63], 22
+	prefetch	[$inp + 32+63], 20
+___
+$::code.=<<___ if ($alg eq "aes");
+	aes_eround01	%f16, %f14, %f2, %f8
+	aes_eround23	%f18, %f14, %f2, %f2
+	aes_eround01	%f16, %f14, %f6, %f10
+	aes_eround23	%f18, %f14, %f6, %f6
+___
+$::code.=<<___ if ($alg eq "cmll");
+	camellia_f	%f16, %f2, %f14, %f2
+	camellia_f	%f16, %f6, %f14, %f6
+	camellia_f	%f18, %f14, %f2, %f0
+	camellia_f	%f18, %f14, %f6, %f4
+___
+$::code.=<<___;
+	call		_${alg}${bits}_encrypt_2x+16
+	add		$inp, 32, $inp
+
+	movxtod		%o0, %f8
+	movxtod		%o1, %f10
+	movxtod		%o2, %f12
+	fxor		%f8, %f0, %f0		! ^= inp
+	movxtod		%o3, %f8
+	fxor		%f10, %f2, %f2
+	fxor		%f12, %f4, %f4
+	fxor		%f8, %f6, %f6
+
+	brnz,pn		$ooff, 2f
+	sub		$len, 2, $len
+		
+	std		%f0, [$out + 0]
+	std		%f2, [$out + 8]
+	std		%f4, [$out + 16]
+	std		%f6, [$out + 24]
+	brnz,pt		$len, .L${bits}_ctr32_loop2x
+	add		$out, 32, $out
+
+	ret
+	restore
+
+.align	16
+2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
+						! and ~3x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f8		! handle unaligned output
+	faligndata	%f0, %f2, %f0
+	faligndata	%f2, %f4, %f2
+	faligndata	%f4, %f6, %f4
+	faligndata	%f6, %f6, %f6
+
+	stda		%f8, [$out + $omask]0xc0	! partial store
+	std		%f0, [$out + 8]
+	std		%f2, [$out + 16]
+	std		%f4, [$out + 24]
+	add		$out, 32, $out
+	orn		%g0, $omask, $omask
+	stda		%f6, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
+	orn		%g0, $omask, $omask
+
+	ret
+	restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align	32
+.L${bits}_ctr32_blk:
+	add	$out, $len, $blk_init
+	and	$blk_init, 63, $blk_init	! tail
+	sub	$len, $blk_init, $len
+	add	$blk_init, 15, $blk_init	! round up to 16n
+	srlx	$len, 4, $len
+	srl	$blk_init, 4, $blk_init
+	sub	$len, 1, $len
+	add	$blk_init, 1, $blk_init
+
+.L${bits}_ctr32_blk_loop2x:
+	ldx		[$inp + 0], %o0
+	ldx		[$inp + 8], %o1
+	ldx		[$inp + 16], %o2
+	brz,pt		$ileft, 5f
+	ldx		[$inp + 24], %o3
+
+	ldx		[$inp + 32], %o4
+	sllx		%o0, $ileft, %o0
+	srlx		%o1, $iright, %g1
+	or		%g1, %o0, %o0
+	sllx		%o1, $ileft, %o1
+	srlx		%o2, $iright, %g1
+	or		%g1, %o1, %o1
+	sllx		%o2, $ileft, %o2
+	srlx		%o3, $iright, %g1
+	or		%g1, %o2, %o2
+	sllx		%o3, $ileft, %o3
+	srlx		%o4, $iright, %o4
+	or		%o4, %o3, %o3
+5:
+	xor		%g5, %l7, %g1		! ^= rk[0]
+	add		%l7, 1, %l7
+	movxtod		%g1, %f2
+	srl		%l7, 0, %l7		! clruw
+	xor		%g5, %l7, %g1
+	add		%l7, 1, %l7
+	movxtod		%g1, %f6
+	srl		%l7, 0, %l7		! clruw
+	prefetch	[$inp + 32+63], 20
+___
+$::code.=<<___ if ($alg eq "aes");
+	aes_eround01	%f16, %f14, %f2, %f8
+	aes_eround23	%f18, %f14, %f2, %f2
+	aes_eround01	%f16, %f14, %f6, %f10
+	aes_eround23	%f18, %f14, %f6, %f6
+___
+$::code.=<<___ if ($alg eq "cmll");
+	camellia_f	%f16, %f2, %f14, %f2
+	camellia_f	%f16, %f6, %f14, %f6
+	camellia_f	%f18, %f14, %f2, %f0
+	camellia_f	%f18, %f14, %f6, %f4
+___
+$::code.=<<___;
+	call		_${alg}${bits}_encrypt_2x+16
+	add		$inp, 32, $inp
+	subcc		$len, 2, $len
+
+	movxtod		%o0, %f8
+	movxtod		%o1, %f10
+	movxtod		%o2, %f12
+	fxor		%f8, %f0, %f0		! ^= inp
+	movxtod		%o3, %f8
+	fxor		%f10, %f2, %f2
+	fxor		%f12, %f4, %f4
+	fxor		%f8, %f6, %f6
+
+	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	add		$out, 8, $out
+	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	add		$out, 8, $out
+	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	add		$out, 8, $out
+	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	bgu,pt		$::size_t_cc, .L${bits}_ctr32_blk_loop2x
+	add		$out, 8, $out
+
+	add		$blk_init, $len, $len
+	andcc		$len, 1, %g0		! is number of blocks even?
+	membar		#StoreLoad|#StoreStore
+	bnz,pt		%icc, .L${bits}_ctr32_loop
+	srl		$len, 0, $len
+	brnz,pn		$len, .L${bits}_ctr32_loop2x
+	nop
+
+	ret
+	restore
+.type	${alg}${bits}_t4_ctr32_encrypt,#function
+.size	${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
+___
+}
+
+sub alg_xts_implement {
+my ($alg,$bits,$dir) = @_;
+my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
+my $rem=$ivec;
+
+$::code.=<<___;
+.globl	${alg}${bits}_t4_xts_${dir}crypt
+.align	32
+${alg}${bits}_t4_xts_${dir}crypt:
+	save		%sp, -$::frame-16, %sp
+	srln		$len, 0, $len		! needed on v8+, "nop" on v9
+
+	mov		$ivec, %o0
+	add		%fp, $::bias-16, %o1
+	call		${alg}_t4_encrypt
+	mov		$key2, %o2
+
+	add		%fp, $::bias-16, %l7
+	ldxa		[%l7]0x88, %g2
+	add		%fp, $::bias-8, %l7
+	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak
+
+	sethi		%hi(0x76543210), %l7
+	or		%l7, %lo(0x76543210), %l7
+	bmask		%l7, %g0, %g0		! byte swap mask
+
+	prefetch	[$inp], 20
+	prefetch	[$inp + 63], 20
+	call		_${alg}${bits}_load_${dir}ckey
+	and		$len, 15,  $rem
+	and		$len, -16, $len
+___
+$code.=<<___ if ($dir eq "de");
+	mov		0, %l7
+	movrnz		$rem, 16,  %l7
+	sub		$len, %l7, $len
+___
+$code.=<<___;
+
+	sub		$inp, $out, $blk_init	! $inp!=$out
+	and		$inp, 7, $ileft
+	andn		$inp, 7, $inp
+	sll		$ileft, 3, $ileft
+	mov		64, $iright
+	mov		0xff, $omask
+	sub		$iright, $ileft, $iright
+	and		$out, 7, $ooff
+	cmp		$len, 255
+	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
+	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
+	brnz,pn		$blk_init, .L${bits}_xts_${dir}blk !	$inp==$out)
+	srl		$omask, $ooff, $omask
+
+	andcc		$len, 16, %g0		! is number of blocks even?
+___
+$code.=<<___ if ($dir eq "de");
+	brz,pn		$len, .L${bits}_xts_${dir}steal
+___
+$code.=<<___;
+	alignaddrl	$out, %g0, $out
+	bz		%icc, .L${bits}_xts_${dir}loop2x
+	srlx		$len, 4, $len
+.L${bits}_xts_${dir}loop:
+	ldx		[$inp + 0], %o0
+	brz,pt		$ileft, 4f
+	ldx		[$inp + 8], %o1
+
+	ldx		[$inp + 16], %o2
+	sllx		%o0, $ileft, %o0
+	srlx		%o1, $iright, %g1
+	sllx		%o1, $ileft, %o1
+	or		%g1, %o0, %o0
+	srlx		%o2, $iright, %o2
+	or		%o2, %o1, %o1
+4:
+	movxtod		%g2, %f12
+	movxtod		%g3, %f14
+	bshuffle	%f12, %f12, %f12
+	bshuffle	%f14, %f14, %f14
+
+	xor		%g4, %o0, %o0		! ^= rk[0]
+	xor		%g5, %o1, %o1
+	movxtod		%o0, %f0
+	movxtod		%o1, %f2
+
+	fxor		%f12, %f0, %f0		! ^= tweak[0]
+	fxor		%f14, %f2, %f2
+
+	prefetch	[$out + 63], 22
+	prefetch	[$inp + 16+63], 20
+	call		_${alg}${bits}_${dir}crypt_1x
+	add		$inp, 16, $inp
+
+	fxor		%f12, %f0, %f0		! ^= tweak[0]
+	fxor		%f14, %f2, %f2
+
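+	! the next tweak is the current one times x in GF(2^128): %g3:%g2 is
+	! doubled and, if its top bit was set, the XTS reduction constant
+	! 0x87 (x^7+x^2+x+1) is xor-ed into the low word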
+	srax		%g3, 63, %l7		! next tweak value
+	addcc		%g2, %g2, %g2
+	and		%l7, 0x87, %l7
+	addxc		%g3, %g3, %g3
+	xor		%l7, %g2, %g2
+
+	brnz,pn		$ooff, 2f
+	sub		$len, 1, $len
+		
+	std		%f0, [$out + 0]
+	std		%f2, [$out + 8]
+	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
+	add		$out, 16, $out
+
+	brnz,pn		$rem, .L${bits}_xts_${dir}steal
+	nop
+
+	ret
+	restore
+
+.align	16
+2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
+						! and ~3x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f4		! handle unaligned output
+	faligndata	%f0, %f2, %f6
+	faligndata	%f2, %f2, %f8
+	stda		%f4, [$out + $omask]0xc0	! partial store
+	std		%f6, [$out + 8]
+	add		$out, 16, $out
+	orn		%g0, $omask, $omask
+	stda		%f8, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
+	orn		%g0, $omask, $omask
+
+	brnz,pn		$rem, .L${bits}_xts_${dir}steal
+	nop
+
+	ret
+	restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align	32
+.L${bits}_xts_${dir}loop2x:
+	ldx		[$inp + 0], %o0
+	ldx		[$inp + 8], %o1
+	ldx		[$inp + 16], %o2
+	brz,pt		$ileft, 4f
+	ldx		[$inp + 24], %o3
+
+	ldx		[$inp + 32], %o4
+	sllx		%o0, $ileft, %o0
+	srlx		%o1, $iright, %g1
+	or		%g1, %o0, %o0
+	sllx		%o1, $ileft, %o1
+	srlx		%o2, $iright, %g1
+	or		%g1, %o1, %o1
+	sllx		%o2, $ileft, %o2
+	srlx		%o3, $iright, %g1
+	or		%g1, %o2, %o2
+	sllx		%o3, $ileft, %o3
+	srlx		%o4, $iright, %o4
+	or		%o4, %o3, %o3
+4:
+	movxtod		%g2, %f12
+	movxtod		%g3, %f14
+	bshuffle	%f12, %f12, %f12
+	bshuffle	%f14, %f14, %f14
+
+	srax		%g3, 63, %l7		! next tweak value
+	addcc		%g2, %g2, %g2
+	and		%l7, 0x87, %l7
+	addxc		%g3, %g3, %g3
+	xor		%l7, %g2, %g2
+
+	movxtod		%g2, %f8
+	movxtod		%g3, %f10
+	bshuffle	%f8,  %f8,  %f8
+	bshuffle	%f10, %f10, %f10
+
+	xor		%g4, %o0, %o0		! ^= rk[0]
+	xor		%g5, %o1, %o1
+	xor		%g4, %o2, %o2		! ^= rk[0]
+	xor		%g5, %o3, %o3
+	movxtod		%o0, %f0
+	movxtod		%o1, %f2
+	movxtod		%o2, %f4
+	movxtod		%o3, %f6
+
+	fxor		%f12, %f0, %f0		! ^= tweak[0]
+	fxor		%f14, %f2, %f2
+	fxor		%f8,  %f4, %f4		! ^= tweak[0]
+	fxor		%f10, %f6, %f6
+
+	prefetch	[$out + 63], 22
+	prefetch	[$inp + 32+63], 20
+	call		_${alg}${bits}_${dir}crypt_2x
+	add		$inp, 32, $inp
+
+	movxtod		%g2, %f8
+	movxtod		%g3, %f10
+
+	srax		%g3, 63, %l7		! next tweak value
+	addcc		%g2, %g2, %g2
+	and		%l7, 0x87, %l7
+	addxc		%g3, %g3, %g3
+	xor		%l7, %g2, %g2
+
+	bshuffle	%f8,  %f8,  %f8
+	bshuffle	%f10, %f10, %f10
+
+	fxor		%f12, %f0, %f0		! ^= tweak[0]
+	fxor		%f14, %f2, %f2
+	fxor		%f8,  %f4, %f4
+	fxor		%f10, %f6, %f6
+
+	brnz,pn		$ooff, 2f
+	sub		$len, 2, $len
+		
+	std		%f0, [$out + 0]
+	std		%f2, [$out + 8]
+	std		%f4, [$out + 16]
+	std		%f6, [$out + 24]
+	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
+	add		$out, 32, $out
+
+	fsrc2		%f4, %f0
+	fsrc2		%f6, %f2
+	brnz,pn		$rem, .L${bits}_xts_${dir}steal
+	nop
+
+	ret
+	restore
+
+.align	16
+2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
+						! and ~3x deterioration
+						! in inp==out case
+	faligndata	%f0, %f0, %f8		! handle unaligned output
+	faligndata	%f0, %f2, %f10
+	faligndata	%f2, %f4, %f12
+	faligndata	%f4, %f6, %f14
+	faligndata	%f6, %f6, %f0
+
+	stda		%f8, [$out + $omask]0xc0	! partial store
+	std		%f10, [$out + 8]
+	std		%f12, [$out + 16]
+	std		%f14, [$out + 24]
+	add		$out, 32, $out
+	orn		%g0, $omask, $omask
+	stda		%f0, [$out + $omask]0xc0	! partial store
+
+	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
+	orn		%g0, $omask, $omask
+
+	fsrc2		%f4, %f0
+	fsrc2		%f6, %f2
+	brnz,pn		$rem, .L${bits}_xts_${dir}steal
+	nop
+
+	ret
+	restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align	32
+.L${bits}_xts_${dir}blk:
+	add	$out, $len, $blk_init
+	and	$blk_init, 63, $blk_init	! tail
+	sub	$len, $blk_init, $len
+	add	$blk_init, 15, $blk_init	! round up to 16n
+	srlx	$len, 4, $len
+	srl	$blk_init, 4, $blk_init
+	sub	$len, 1, $len
+	add	$blk_init, 1, $blk_init
+
+.L${bits}_xts_${dir}blk2x:
+	ldx		[$inp + 0], %o0
+	ldx		[$inp + 8], %o1
+	ldx		[$inp + 16], %o2
+	brz,pt		$ileft, 5f
+	ldx		[$inp + 24], %o3
+
+	ldx		[$inp + 32], %o4
+	sllx		%o0, $ileft, %o0
+	srlx		%o1, $iright, %g1
+	or		%g1, %o0, %o0
+	sllx		%o1, $ileft, %o1
+	srlx		%o2, $iright, %g1
+	or		%g1, %o1, %o1
+	sllx		%o2, $ileft, %o2
+	srlx		%o3, $iright, %g1
+	or		%g1, %o2, %o2
+	sllx		%o3, $ileft, %o3
+	srlx		%o4, $iright, %o4
+	or		%o4, %o3, %o3
+5:
+	movxtod		%g2, %f12
+	movxtod		%g3, %f14
+	bshuffle	%f12, %f12, %f12
+	bshuffle	%f14, %f14, %f14
+
+	srax		%g3, 63, %l7		! next tweak value
+	addcc		%g2, %g2, %g2
+	and		%l7, 0x87, %l7
+	addxc		%g3, %g3, %g3
+	xor		%l7, %g2, %g2
+
+	movxtod		%g2, %f8
+	movxtod		%g3, %f10
+	bshuffle	%f8,  %f8,  %f8
+	bshuffle	%f10, %f10, %f10
+
+	xor		%g4, %o0, %o0		! ^= rk[0]
+	xor		%g5, %o1, %o1
+	xor		%g4, %o2, %o2		! ^= rk[0]
+	xor		%g5, %o3, %o3
+	movxtod		%o0, %f0
+	movxtod		%o1, %f2
+	movxtod		%o2, %f4
+	movxtod		%o3, %f6
+
+	fxor		%f12, %f0, %f0		! ^= tweak[0]
+	fxor		%f14, %f2, %f2
+	fxor		%f8,  %f4, %f4		! ^= tweak[0]
+	fxor		%f10, %f6, %f6
+
+	prefetch	[$inp + 32+63], 20
+	call		_${alg}${bits}_${dir}crypt_2x
+	add		$inp, 32, $inp
+
+	movxtod		%g2, %f8
+	movxtod		%g3, %f10
+
+	srax		%g3, 63, %l7		! next tweak value
+	addcc		%g2, %g2, %g2
+	and		%l7, 0x87, %l7
+	addxc		%g3, %g3, %g3
+	xor		%l7, %g2, %g2
+
+	bshuffle	%f8,  %f8,  %f8
+	bshuffle	%f10, %f10, %f10
+
+	fxor		%f12, %f0, %f0		! ^= tweak[0]
+	fxor		%f14, %f2, %f2
+	fxor		%f8,  %f4, %f4
+	fxor		%f10, %f6, %f6
+
+	subcc		$len, 2, $len
+	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	add		$out, 8, $out
+	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	add		$out, 8, $out
+	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	add		$out, 8, $out
+	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
+	bgu,pt		$::size_t_cc, .L${bits}_xts_${dir}blk2x
+	add		$out, 8, $out
+
+	add		$blk_init, $len, $len
+	andcc		$len, 1, %g0		! is number of blocks even?
+	membar		#StoreLoad|#StoreStore
+	bnz,pt		%icc, .L${bits}_xts_${dir}loop
+	srl		$len, 0, $len
+	brnz,pn		$len, .L${bits}_xts_${dir}loop2x
+	nop
+
+	fsrc2		%f4, %f0
+	fsrc2		%f6, %f2
+	brnz,pn		$rem, .L${bits}_xts_${dir}steal
+	nop
+
+	ret
+	restore
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+___
+$code.=<<___ if ($dir eq "en");
+.align	32
+.L${bits}_xts_${dir}steal:
+	std		%f0, [%fp + $::bias-16]	! copy of output
+	std		%f2, [%fp + $::bias-8]
+
+	srl		$ileft, 3, $ileft
+	add		%fp, $::bias-16, %l7
+	add		$inp, $ileft, $inp	! original $inp+$len&-15
+	add		$out, $ooff, $out	! original $out+$len&-15
+	mov		0, $ileft
+	nop					! align
+
+.L${bits}_xts_${dir}stealing:
+	ldub		[$inp + $ileft], %o0
+	ldub		[%l7  + $ileft], %o1
+	dec		$rem
+	stb		%o0, [%l7  + $ileft]
+	stb		%o1, [$out + $ileft]
+	brnz		$rem, .L${bits}_xts_${dir}stealing
+	inc		$ileft
+
+	mov		%l7, $inp
+	sub		$out, 16, $out
+	mov		0, $ileft
+	sub		$out, $ooff, $out
+	ba		.L${bits}_xts_${dir}loop	! one more time
+	mov		1, $len				! $rem is 0
+___
+$code.=<<___ if ($dir eq "de");
+.align	32
+.L${bits}_xts_${dir}steal:
+	ldx		[$inp + 0], %o0
+	brz,pt		$ileft, 8f
+	ldx		[$inp + 8], %o1
+
+	ldx		[$inp + 16], %o2
+	sllx		%o0, $ileft, %o0
+	srlx		%o1, $iright, %g1
+	sllx		%o1, $ileft, %o1
+	or		%g1, %o0, %o0
+	srlx		%o2, $iright, %o2
+	or		%o2, %o1, %o1
+8:
+	srax		%g3, 63, %l7		! next tweak value
+	addcc		%g2, %g2, %o2
+	and		%l7, 0x87, %l7
+	addxc		%g3, %g3, %o3
+	xor		%l7, %o2, %o2
+
+	movxtod		%o2, %f12
+	movxtod		%o3, %f14
+	bshuffle	%f12, %f12, %f12
+	bshuffle	%f14, %f14, %f14
+
+	xor		%g4, %o0, %o0		! ^= rk[0]
+	xor		%g5, %o1, %o1
+	movxtod		%o0, %f0
+	movxtod		%o1, %f2
+
+	fxor		%f12, %f0, %f0		! ^= tweak[0]
+	fxor		%f14, %f2, %f2
+
+	call		_${alg}${bits}_${dir}crypt_1x
+	add		$inp, 16, $inp
+
+	fxor		%f12, %f0, %f0		! ^= tweak[0]
+	fxor		%f14, %f2, %f2
+
+	std		%f0, [%fp + $::bias-16]
+	std		%f2, [%fp + $::bias-8]
+
+	srl		$ileft, 3, $ileft
+	add		%fp, $::bias-16, %l7
+	add		$inp, $ileft, $inp	! original $inp+$len&-15
+	add		$out, $ooff, $out	! original $out+$len&-15
+	mov		0, $ileft
+	add		$out, 16, $out
+	nop					! align
+
+.L${bits}_xts_${dir}stealing:
+	ldub		[$inp + $ileft], %o0
+	ldub		[%l7  + $ileft], %o1
+	dec		$rem
+	stb		%o0, [%l7  + $ileft]
+	stb		%o1, [$out + $ileft]
+	brnz		$rem, .L${bits}_xts_${dir}stealing
+	inc		$ileft
+
+	mov		%l7, $inp
+	sub		$out, 16, $out
+	mov		0, $ileft
+	sub		$out, $ooff, $out
+	ba		.L${bits}_xts_${dir}loop	! one more time
+	mov		1, $len				! $rem is 0
+___
+$code.=<<___;
+	ret
+	restore
+.type	${alg}${bits}_t4_xts_${dir}crypt,#function
+.size	${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
+___
+}
+
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# Idea is to reserve for option to produce "universal" binary and let
+# programmer detect if current CPU is VIS capable at run-time.
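+#
+# As a worked illustration of the encoding below (a sketch, not extra
+# generated code): unvis() would turn "fxor %f0,%f0,%f0" (opf 0x06c) into
+#	.word 0x81b00d80	! 0x81b00000|rd<<25|rs1<<14|opf<<5|rs2
+# i.e. the same bit pattern as the hand-encoded fxor probes in sparccpuid.S.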
+sub unvis {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %visopf = (	"faligndata"	=> 0x048,
+		"bshuffle"	=> 0x04c,
+		"fnot2"		=> 0x066,
+		"fxor"		=> 0x06c,
+		"fsrc2"		=> 0x078	);
+
+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+    if ($opf=$visopf{$mnemonic}) {
+	foreach ($rs1,$rs2,$rd) {
+	    return $ref if (!/%f([0-9]{1,2})/);
+	    $_=$1;
+	    if ($1>=32) {
+		return $ref if ($1&1);
+		# re-encode for upper double register addressing
+		$_=($1|$1>>5)&31;
+	    }
+	}
+
+	return	sprintf ".word\t0x%08x !%s",
+			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+			$ref;
+    } else {
+	return $ref;
+    }
+}
+
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my ($ref,$opf);
+my %visopf = (	"addxc"		=> 0x011,
+		"addxccc"	=> 0x013,
+		"umulxhi"	=> 0x016,
+		"alignaddr"	=> 0x018,
+		"bmask"		=> 0x019,
+		"alignaddrl"	=> 0x01a	);
+
+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+    if ($opf=$visopf{$mnemonic}) {
+	foreach ($rs1,$rs2,$rd) {
+	    return $ref if (!/%([goli])([0-9])/);
+	    $_=$bias{$1}+$2;
+	}
+
+	return	sprintf ".word\t0x%08x !%s",
+			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+			$ref;
+    } else {
+	return $ref;
+    }
+}
+
+sub unaes_round {	# 4-argument instructions
+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
+my ($ref,$opf);
+my %aesopf = (	"aes_eround01"	=> 0,
+		"aes_eround23"	=> 1,
+		"aes_dround01"	=> 2,
+		"aes_dround23"	=> 3,
+		"aes_eround01_l"=> 4,
+		"aes_eround23_l"=> 5,
+		"aes_dround01_l"=> 6,
+		"aes_dround23_l"=> 7,
+		"aes_kexpand1"	=> 8	);
+
+    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
+
+    if (defined($opf=$aesopf{$mnemonic})) {
+	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
+	foreach ($rs1,$rs2,$rd) {
+	    return $ref if (!/%f([0-9]{1,2})/);
+	    $_=$1;
+	    if ($1>=32) {
+		return $ref if ($1&1);
+		# re-encode for upper double register addressing
+		$_=($1|$1>>5)&31;
+	    }
+	}
+
+	return	sprintf ".word\t0x%08x !%s",
+			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
+			$ref;
+    } else {
+	return $ref;
+    }
+}
+
+sub unaes_kexpand {	# 3-argument instructions
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %aesopf = (	"aes_kexpand0"	=> 0x130,
+		"aes_kexpand2"	=> 0x131	);
+
+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+    if (defined($opf=$aesopf{$mnemonic})) {
+	foreach ($rs1,$rs2,$rd) {
+	    return $ref if (!/%f([0-9]{1,2})/);
+	    $_=$1;
+	    if ($1>=32) {
+		return $ref if ($1&1);
+		# re-encode for upper double register addressing
+		$_=($1|$1>>5)&31;
+	    }
+	}
+
+	return	sprintf ".word\t0x%08x !%s",
+			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
+			$ref;
+    } else {
+	return $ref;
+    }
+}
+
+sub uncamellia_f {	# 4-argument instructions
+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
+my ($ref,$opf);
+
+    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
+
+    if (1) {
+	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
+	foreach ($rs1,$rs2,$rd) {
+	    return $ref if (!/%f([0-9]{1,2})/);
+	    $_=$1;
+	    if ($1>=32) {
+		return $ref if ($1&1);
+		# re-encode for upper double register addressing
+		$_=($1|$1>>5)&31;
+	    }
+	}
+
+	return	sprintf ".word\t0x%08x !%s",
+			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
+			$ref;
+    } else {
+	return $ref;
+    }
+}
+
+sub uncamellia3 {	# 3-argument instructions
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %cmllopf = (	"camellia_fl"	=> 0x13c,
+		"camellia_fli"	=> 0x13d	);
+
+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+    if (defined($opf=$cmllopf{$mnemonic})) {
+	foreach ($rs1,$rs2,$rd) {
+	    return $ref if (!/%f([0-9]{1,2})/);
+	    $_=$1;
+	    if ($1>=32) {
+		return $ref if ($1&1);
+		# re-encode for upper double register addressing
+		$_=($1|$1>>5)&31;
+	    }
+	}
+
+	return	sprintf ".word\t0x%08x !%s",
+			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
+			$ref;
+    } else {
+	return $ref;
+    }
+}
+
+sub unmovxtox {		# 2-argument instructions
+my ($mnemonic,$rs,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
+my ($ref,$opf);
+my %movxopf = (	"movdtox"	=> 0x110,
+		"movstouw"	=> 0x111,
+		"movstosw"	=> 0x113,
+		"movxtod"	=> 0x118,
+		"movwtos"	=> 0x119	);
+
+    $ref = "$mnemonic\t$rs,$rd";
+
+    if (defined($opf=$movxopf{$mnemonic})) {
+	foreach ($rs,$rd) {
+	    return $ref if (!/%([fgoli])([0-9]{1,2})/);
+	    $_=$bias{$1}+$2;
+	    if ($2>=32) {
+		return $ref if ($2&1);
+		# re-encode for upper double register addressing
+		$_=($2|$2>>5)&31;
+	    }
+	}
+
+	return	sprintf ".word\t0x%08x !%s",
+			2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
+			$ref;
+    } else {
+	return $ref;
+    }
+}
+
+sub undes {
+my ($mnemonic)=shift;
+my @args=@_;
+my ($ref,$opf);
+my %desopf = (	"des_round"	=> 0b1001,
+		"des_ip"	=> 0b100110100,
+		"des_iip"	=> 0b100110101,
+		"des_kexpand"	=> 0b100110110	);
+
+    $ref = "$mnemonic\t".join(",",@_);
+
+    if (defined($opf=$desopf{$mnemonic})) {	# 4-arg
+	if ($mnemonic eq "des_round") {
+	    foreach (@args[0..3]) {
+		return $ref if (!/%f([0-9]{1,2})/);
+		$_=$1;
+		if ($1>=32) {
+		    return $ref if ($1&1);
+		    # re-encode for upper double register addressing
+		    $_=($1|$1>>5)&31;
+		}
+	    }
+	    return  sprintf ".word\t0x%08x !%s",
+			    2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
+			    $ref;
+	} elsif ($mnemonic eq "des_kexpand") {	# 3-arg
+	    foreach (@args[0..2]) {
+		return $ref if (!/(%f)?([0-9]{1,2})/);
+		$_=$2;
+		if ($2>=32) {
+		    return $ref if ($2&1);
+		    # re-encode for upper double register addressing
+		    $_=($2|$2>>5)&31;
+		}
+	    }
+	    return  sprintf ".word\t0x%08x !%s",
+			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
+			    $ref;
+	} else {				# 2-arg
+	    foreach (@args[0..1]) {
+		return $ref if (!/%f([0-9]{1,2})/);
+		$_=$1;
+		if ($1>=32) {
+		    return $ref if ($2&1);
+		    # re-encode for upper double register addressing
+		    $_=($1|$1>>5)&31;
+		}
+	    }
+	    return  sprintf ".word\t0x%08x !%s",
+			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
+			    $ref;
+	}
+    } else {
+	return $ref;
+    }
+}
+
+sub emit_assembler {
+    foreach (split("\n",$::code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
+
+	s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
+		&unaes_round($1,$2,$3,$4,$5)
+	 /geo or
+	s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+		&unaes_kexpand($1,$2,$3,$4)
+	 /geo or
+	s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
+		&uncamellia_f($1,$2,$3,$4,$5)
+	 /geo or
+	s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+		&uncamellia3($1,$2,$3,$4)
+	 /geo or
+	s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
+		&undes($1,$2,$3,$4,$5)
+	 /geo or
+	s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
+		&unmovxtox($1,$2,$3)
+	 /geo or
+	s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
+		&unmovxtox($1,$2,$3)
+	 /geo or
+	s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+		&unvis($1,$2,$3,$4)
+	 /geo or
+	s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+		&unvis3($1,$2,$3,$4)
+	 /geo;
+
+	print $_,"\n";
+    }
+}
+
+1;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/inline-t4/vis3-mont.pl	Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,375 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2012.
+#
+# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
+# onward. There are three new instructions used here: umulxhi,
+# addxc[cc] and initializing store. On T3 RSA private key operations
+# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
+# lengths. This is without dedicated squaring procedure. On T4
+# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
+# for reference purposes, because T4 has dedicated Montgomery
+# multiplication and squaring *instructions* that deliver even more.
+
+$bits=32;
+for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64)  { $bias=2047; $frame=192; }
+else            { $bias=0;    $frame=112; }
+
+$code.=<<___ if ($bits==64);
+.register	%g2,#scratch
+.register	%g3,#scratch
+___
+$code.=<<___;
+#include <openssl/fipssyms.h>
+
+.section	".text",#alloc,#execinstr
+___
+
+($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
+	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
+
+# int bn_mul_mont(
+$rp="%o0";	# BN_ULONG *rp,
+$ap="%o1";	# const BN_ULONG *ap,
+$bp="%o2";	# const BN_ULONG *bp,
+$np="%o3";	# const BN_ULONG *np,
+$n0p="%o4";	# const BN_ULONG *n0,
+$num="%o5";	# int num);	# caller ensures that num is even
+				# and >=6
+$code.=<<___;
+.globl	bn_mul_mont_vis3
+.align	32
+bn_mul_mont_vis3:
+	add	%sp,	$bias,	%g4	! real top of stack
+	sll	$num,	2,	$num	! size in bytes
+	add	$num,	63,	%g5
+	andn	%g5,	63,	%g5	! buffer size rounded up to 64 bytes
+	add	%g5,	%g5,	%g1
+	add	%g5,	%g1,	%g1	! 3*buffer size
+	sub	%g4,	%g1,	%g1
+	andn	%g1,	63,	%g1	! align at 64 byte
+	sub	%g1,	$frame,	%g1	! new top of stack
+	sub	%g1,	%g4,	%g1
+
+	save	%sp,	%g1,	%sp
+___
+
+#	+-------------------------------+<-----	%sp
+#	.				.
+#	+-------------------------------+<-----	aligned at 64 bytes
+#	| __int64 tmp[0]		|
+#	+-------------------------------+
+#	.				.
+#	.				.
+#	+-------------------------------+<----- aligned at 64 bytes
+#	| __int64 ap[1..0]		|	converted ap[]
+#	+-------------------------------+
+#	| __int64 np[1..0]		|	converted np[]
+#	+-------------------------------+
+#	| __int64 ap[3..2]		|
+#	.				.
+#	.				.
+#	+-------------------------------+
+($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
+($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
+($ovf,$i)=($t0,$t1);
+$code.=<<___;
+	ld	[$n0p+0],	$t0	! pull n0[0..1] value
+	add	%sp, $bias+$frame, $tp
+	ld	[$n0p+4],	$t1
+	add	$tp,	%g5,	$anp
+	ld	[$bp+0],	$t2	! m0=bp[0]
+	sllx	$t1,	32,	$n0
+	ld	[$bp+4],	$t3
+	or	$t0,	$n0,	$n0
+	add	$bp,	8,	$bp
+
+	ld	[$ap+0],	$t0	! ap[0]
+	sllx	$t3,	32,	$m0
+	ld	[$ap+4],	$t1
+	or	$t2,	$m0,	$m0
+
+	ld	[$ap+8],	$t2	! ap[1]
+	sllx	$t1,	32,	$aj
+	ld	[$ap+12],	$t3
+	or	$t0,	$aj,	$aj
+	add	$ap,	16,	$ap
+	stx	$aj,	[$anp]		! converted ap[0]
+
+	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
+	umulxhi	$aj,	$m0,	$hi0
+
+	ld	[$np+0],	$t0	! np[0]
+	sllx	$t3,	32,	$aj
+	ld	[$np+4],	$t1
+	or	$t2,	$aj,	$aj
+
+	ld	[$np+8],	$t2	! np[1]
+	sllx	$t1,	32,	$nj
+	ld	[$np+12],	$t3
+	or	$t0, $nj,	$nj
+	add	$np,	16,	$np
+	stx	$nj,	[$anp+8]	! converted np[0]
+
+	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0
+	stx	$aj,	[$anp+16]	! converted ap[1]
+
+	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+
+	mulx	$nj,	$m1,	$lo1	! np[0]*m1
+	umulxhi	$nj,	$m1,	$hi1
+
+	sllx	$t3,	32,	$nj
+	or	$t2,	$nj,	$nj
+	stx	$nj,	[$anp+24]	! converted np[1]
+	add	$anp,	32,	$anp
+
+	addcc	$lo0,	$lo1,	$lo1
+	addxc	%g0,	$hi1,	$hi1
+
+	mulx	$nj,	$m1,	$nlo	! np[1]*m1
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+
+	ba	.L1st
+	sub	$num,	24,	$cnt	! cnt=num-3
+
+.align	16
+.L1st:
+	ld	[$ap+0],	$t0	! ap[j]
+	addcc	$alo,	$hi0,	$lo0
+	ld	[$ap+4],	$t1
+	addxc	$aj,	%g0,	$hi0
+
+	sllx	$t1,	32,	$aj
+	add	$ap,	8,	$ap
+	or	$t0,	$aj,	$aj
+	stx	$aj,	[$anp]		! converted ap[j]
+
+	ld	[$np+0],	$t2	! np[j]
+	addcc	$nlo,	$hi1,	$lo1
+	ld	[$np+4],	$t3
+	addxc	$nj,	%g0,	$hi1	! nhi=nj
+
+	sllx	$t3,	32,	$nj
+	add	$np,	8,	$np
+	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
+	or	$t2,	$nj,	$nj
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+	stx	$nj,	[$anp+8]	! converted np[j]
+	add	$anp,	16,	$anp	! anp++
+
+	mulx	$nj,	$m1,	$nlo	! np[j]*m1
+	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+	addxc	%g0,	$hi1,	$hi1
+	stx	$lo1,	[$tp]		! tp[j-1]
+	add	$tp,	8,	$tp	! tp++
+
+	brnz,pt	$cnt,	.L1st
+	sub	$cnt,	8,	$cnt	! j--
+!.L1st
+	addcc	$alo,	$hi0,	$lo0
+	addxc	$aj,	%g0,	$hi0	! ahi=aj
+
+	addcc	$nlo,	$hi1,	$lo1
+	addxc	$nj,	%g0,	$hi1
+	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
+	addxc	%g0,	$hi1,	$hi1
+	stx	$lo1,	[$tp]		! tp[j-1]
+	add	$tp,	8,	$tp
+
+	addcc	$hi0,	$hi1,	$hi1
+	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
+	stx	$hi1,	[$tp]
+	add	$tp,	8,	$tp
+
+	ba	.Louter
+	sub	$num,	16,	$i	! i=num-2
+
+.align	16
+.Louter:
+	ld	[$bp+0],	$t2	! m0=bp[i]
+	ld	[$bp+4],	$t3
+
+	sub	$anp,	$num,	$anp	! rewind
+	sub	$tp,	$num,	$tp
+	sub	$anp,	$num,	$anp
+
+	add	$bp,	8,	$bp
+	sllx	$t3,	32,	$m0
+	ldx	[$anp+0],	$aj	! ap[0]
+	or	$t2,	$m0,	$m0
+	ldx	[$anp+8],	$nj	! np[0]
+
+	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
+	ldx	[$tp],		$tj	! tp[0]
+	umulxhi	$aj,	$m0,	$hi0
+	ldx	[$anp+16],	$aj	! ap[1]
+	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
+	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
+	addxc	%g0,	$hi0,	$hi0
+	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+	mulx	$nj,	$m1,	$lo1	! np[0]*m1
+	umulxhi	$nj,	$m1,	$hi1
+	ldx	[$anp+24],	$nj	! np[1]
+	add	$anp,	32,	$anp
+	addcc	$lo1,	$lo0,	$lo1
+	mulx	$nj,	$m1,	$nlo	! np[1]*m1
+	addxc	%g0,	$hi1,	$hi1
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+
+	ba	.Linner
+	sub	$num,	24,	$cnt	! cnt=num-3
+.align	16
+.Linner:
+	addcc	$alo,	$hi0,	$lo0
+	ldx	[$tp+8],	$tj	! tp[j]
+	addxc	$aj,	%g0,	$hi0	! ahi=aj
+	ldx	[$anp+0],	$aj	! ap[j]
+	addcc	$nlo,	$hi1,	$lo1
+	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
+	addxc	$nj,	%g0,	$hi1	! nhi=nj
+	ldx	[$anp+8],	$nj	! np[j]
+	add	$anp,	16,	$anp
+	umulxhi	$aj,	$m0,	$aj	! ahi=aj
+	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
+	mulx	$nj,	$m1,	$nlo	! np[j]*m1
+	addxc	%g0,	$hi0,	$hi0
+	umulxhi	$nj,	$m1,	$nj	! nhi=nj
+	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
+	addxc	%g0,	$hi1,	$hi1
+	stx	$lo1,	[$tp]		! tp[j-1]
+	add	$tp,	8,	$tp
+	brnz,pt	$cnt,	.Linner
+	sub	$cnt,	8,	$cnt
+!.Linner
+	ldx	[$tp+8],	$tj	! tp[j]
+	addcc	$alo,	$hi0,	$lo0
+	addxc	$aj,	%g0,	$hi0	! ahi=aj
+	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
+	addxc	%g0,	$hi0,	$hi0
+
+	addcc	$nlo,	$hi1,	$lo1
+	addxc	$nj,	%g0,	$hi1	! nhi=nj
+	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
+	addxc	%g0,	$hi1,	$hi1
+	stx	$lo1,	[$tp]		! tp[j-1]
+
+	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
+	addxccc	$hi1,	$hi0,	$hi1
+	addxc	%g0,	%g0,	$ovf
+	stx	$hi1,	[$tp+8]
+	add	$tp,	16,	$tp
+
+	brnz,pt	$i,	.Louter
+	sub	$i,	8,	$i
+
+	sub	$anp,	$num,	$anp	! rewind
+	sub	$tp,	$num,	$tp
+	sub	$anp,	$num,	$anp
+	ba	.Lsub
+	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc
+
+.align	16
+.Lsub:
+	ldx	[$tp],		$tj
+	add	$tp,	8,	$tp
+	ldx	[$anp+8],	$nj
+	add	$anp,	16,	$anp
+	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
+	srlx	$tj,	32,	$tj
+	srlx	$nj,	32,	$nj
+	subccc	$tj,	$nj,	$t3
+	add	$rp,	8,	$rp
+	st	$t2,	[$rp-4]		! reverse order
+	st	$t3,	[$rp-8]
+	brnz,pt	$cnt,	.Lsub
+	sub	$cnt,	8,	$cnt
+
+	sub	$anp,	$num,	$anp	! rewind
+	sub	$tp,	$num,	$tp
+	sub	$anp,	$num,	$anp
+	sub	$rp,	$num,	$rp
+
+	subc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
+	and	$tp,	$ovf,	$ap
+	andn	$rp,	$ovf,	$np
+	or	$np,	$ap,	$ap	! ap=borrow?tp:rp
+	ba	.Lcopy
+	sub	$num,	8,	$cnt
+
+.align	16
+.Lcopy:					! copy or in-place refresh
+	ld	[$ap+0],	$t2
+	ld	[$ap+4],	$t3
+	add	$ap,	8,	$ap
+	stx	%g0,	[$tp]		! zap
+	add	$tp,	8,	$tp
+	stx	%g0,	[$anp]		! zap
+	stx	%g0,	[$anp+8]
+	add	$anp,	16,	$anp
+	st	$t3,	[$rp+0]		! flip order
+	st	$t2,	[$rp+4]
+	add	$rp,	8,	$rp
+	brnz	$cnt,	.Lcopy
+	sub	$cnt,	8,	$cnt
+
+	mov	1,	%o0
+	ret
+	restore
+.type	bn_mul_mont_vis3, #function
+.size	bn_mul_mont_vis3, .-bn_mul_mont_vis3
+.asciz  "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
+.align	4
+___
+
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# Idea is to reserve for option to produce "universal" binary and let
+# programmer detect if current CPU is VIS capable at run-time.
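+#
+# For instance (a sketch, not extra generated code): with the integer
+# register bias table below, unvis3() would rewrite "addxc %g0,%o1,%o1"
+# (opf 0x011, rs1=0, rs2=9, rd=9) as
+#	.word 0x93b00229
+# so the generated file assembles even when the assembler lacks VIS3 support.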
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my ($ref,$opf);
+my %visopf = (	"addxc"		=> 0x011,
+		"addxccc"	=> 0x013,
+		"umulxhi"	=> 0x016	);
+
+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+    if ($opf=$visopf{$mnemonic}) {
+	foreach ($rs1,$rs2,$rd) {
+	    return $ref if (!/%([goli])([0-9])/);
+	    $_=$bias{$1}+$2;
+	}
+
+	return	sprintf ".word\t0x%08x !%s",
+			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+			$ref;
+    } else {
+	return $ref;
+    }
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+		&unvis3($1,$2,$3,$4)
+	 /ge;
+
+	print $_,"\n";
+}
+
+close STDOUT;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-fips/patches/302-t4-inline.patch	Wed Nov 02 19:15:09 2016 -0700
@@ -0,0 +1,710 @@
+#
+# This file adds inline T4 instruction support to OpenSSL upstream code.
+# The change was brought in from OpenSSL 1.0.2.
+#
+Index: Configure
+===================================================================
+diff -ru openssl-1.0.1e/Configure openssl-1.0.1e/Configure
+--- a/Configure 2011-05-24 17:02:24.000000000 -0700
++++ b/Configure 2011-07-27 10:48:17.817470000 -0700
+@@ -129,7 +129,7 @@
+ 
+ my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:e_padlock-x86_64.o";
+ my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
+-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
++my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
+ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
+ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
+ my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
+Index: crypto/sparccpuid.S
+===================================================================
+diff -ru openssl-1.0.1e/crypto/sparccpuid.S openssl-1.0.1e/crypto/sparccpuid.S
+--- a/crypto/sparccpuid.S 2011-05-24 17:02:24.000000000 -0700
++++ b/crypto/sparccpuid.S 2011-07-27 10:48:17.817470000 -0700
+@@ -255,7 +255,12 @@
+ !	UltraSPARC IIe		7
+ !	UltraSPARC III		7
+ !	UltraSPARC T1		24
++!	SPARC T4		65(*)
+ !
++! (*)	result has lesser to do with VIS instruction latencies, rdtick
++!	appears that slow, but it does the trick in sense that FP and
++!	VIS code paths are still slower than integer-only ones.
++!
+ ! Numbers for T2 and SPARC64 V-VII are more than welcomed.
+ !
+ ! It would be possible to detect specifically US-T1 by instrumenting
+@@ -264,6 +269,8 @@
+ .global	_sparcv9_vis1_instrument
+ .align	8
+ _sparcv9_vis1_instrument:
++	.word	0x81b00d80	!fxor	%f0,%f0,%f0
++	.word	0x85b08d82	!fxor	%f2,%f2,%f2
+ 	.word	0x91410000	!rd	%tick,%o0
+ 	.word	0x81b00d80	!fxor	%f0,%f0,%f0
+ 	.word	0x85b08d82	!fxor	%f2,%f2,%f2
+Index: crypto/sparcv9cap.c
+===================================================================
+--- openssl-fips-2.0.13/crypto/sparcv9cap.c.~1~	2016-06-20 12:49:42.000000000 -0700
++++ openssl-fips-2.0.13/crypto/sparcv9cap.c	2016-09-08 14:37:20.252604855 -0700
+@@ -4,41 +4,81 @@
+ #include <setjmp.h>
+ #include <signal.h>
+ #include <sys/time.h>
++#include <unistd.h>
+ #include <openssl/bn.h>
++#include "sparc_arch.h"
+ 
+-#define SPARCV9_TICK_PRIVILEGED	(1<<0)
+-#define SPARCV9_PREFER_FPU	(1<<1)
+-#define SPARCV9_VIS1		(1<<2)
+-#define SPARCV9_VIS2		(1<<3)	/* reserved */
+-#define SPARCV9_FMADD		(1<<4)	/* reserved for SPARC64 V */
+-#define SPARCV9_BLK		(1<<5)	/* VIS1 block copy */
++#if defined(__GNUC__) && defined(__linux)
++__attribute__((visibility("hidden")))
++#endif
+ 
+-static int OPENSSL_sparcv9cap_P=SPARCV9_TICK_PRIVILEGED;
++extern unsigned OPENSSL_sparcv9cap_P[2];
+ 
+ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
+-	{
+-	int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
+-	int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
++{
++    int bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
++    int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
++    int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
+ 
+-	if (num>=8 && !(num&1) &&
+-	    (OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
+-		(SPARCV9_PREFER_FPU|SPARCV9_VIS1))
+-		return bn_mul_mont_fpu(rp,ap,bp,np,n0,num);
+-	else
+-		return bn_mul_mont_int(rp,ap,bp,np,n0,num);
+-	}
++    if (!(num & 1) && num >= 6) {
++        if ((num & 15) == 0 && num <= 64 &&
++            (OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) ==
++            (CFR_MONTMUL | CFR_MONTSQR)) {
++            typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap,
++                                          const BN_ULONG *bp,
++                                          const BN_ULONG *np,
++                                          const BN_ULONG *n0);
++            int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap,
++                                 const BN_ULONG *bp, const BN_ULONG *np,
++                                 const BN_ULONG *n0);
++            int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap,
++                                  const BN_ULONG *bp, const BN_ULONG *np,
++                                  const BN_ULONG *n0);
++            int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap,
++                                  const BN_ULONG *bp, const BN_ULONG *np,
++                                  const BN_ULONG *n0);
++            int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap,
++                                  const BN_ULONG *bp, const BN_ULONG *np,
++                                  const BN_ULONG *n0);
++            static const bn_mul_mont_f funcs[4] = {
++                bn_mul_mont_t4_8, bn_mul_mont_t4_16,
++                bn_mul_mont_t4_24, bn_mul_mont_t4_32
++            };
++            bn_mul_mont_f worker = funcs[num / 16 - 1];
+ 
++            if ((*worker) (rp, ap, bp, np, n0))
++                return 1;
++            /* retry once and fall back */
++            if ((*worker) (rp, ap, bp, np, n0))
++                return 1;
++            return bn_mul_mont_vis3(rp, ap, bp, np, n0, num);
++        }
++        if ((OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3))
++            return bn_mul_mont_vis3(rp, ap, bp, np, n0, num);
++        else if (num >= 8 &&
++                 (OPENSSL_sparcv9cap_P[0] &
++                  (SPARCV9_PREFER_FPU | SPARCV9_VIS1)) ==
++                 (SPARCV9_PREFER_FPU | SPARCV9_VIS1))
++            return bn_mul_mont_fpu(rp, ap, bp, np, n0, num);
++    }
++    return bn_mul_mont_int(rp, ap, bp, np, n0, num);
++}
++
++
+ unsigned long	_sparcv9_rdtick(void);
+ void		_sparcv9_vis1_probe(void);
+ unsigned long	_sparcv9_vis1_instrument(void);
+ void		_sparcv9_vis2_probe(void);
+ void		_sparcv9_fmadd_probe(void);
++unsigned long _sparcv9_rdcfr(void);
++void _sparcv9_vis3_probe(void);
++unsigned long _sparcv9_random(void);
+ size_t 		_sparcv9_vis1_instrument_bus(unsigned int *,size_t);
+ size_t		_sparcv8_vis1_instrument_bus2(unsigned int *,size_t,size_t);
+ 
+ unsigned long OPENSSL_rdtsc(void)
+ 	{
+-	if (OPENSSL_sparcv9cap_P&SPARCV9_TICK_PRIVILEGED)
++	if (OPENSSL_sparcv9cap_P[0]&SPARCV9_TICK_PRIVILEGED)
+ #if defined(__sun) && defined(__SVR4)
+ 		return gethrtime();
+ #else
+@@ -50,7 +90,7 @@
+ 
+ size_t OPENSSL_instrument_bus(unsigned int *out,size_t cnt)
+ 	{
+-	if (OPENSSL_sparcv9cap_P&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK) ==
++	if (OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK) ==
+ 			SPARCV9_BLK)
+ 		return _sparcv9_vis1_instrument_bus(out,cnt);
+ 	else
+@@ -59,7 +99,7 @@
+ 
+ size_t OPENSSL_instrument_bus2(unsigned int *out,size_t cnt,size_t max)
+ 	{
+-	if (OPENSSL_sparcv9cap_P&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK) ==
++	if (OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK) ==
+ 			SPARCV9_BLK)
+ 		return _sparcv9_vis1_instrument_bus2(out,cnt,max);
+ 	else
+@@ -120,7 +160,9 @@
+ 
+ 	if ((e=getenv("OPENSSL_sparcv9cap")))
+ 		{
+-		OPENSSL_sparcv9cap_P=strtoul(e,NULL,0);
++		OPENSSL_sparcv9cap_P[0]=strtoul(e,NULL,0);
++		if ((e = strchr(e, ':')))
++			OPENSSL_sparcv9cap_P[1] = strtoul(e + 1, NULL, 0);
+ 		return;
+ 		}
+ 
+@@ -128,17 +170,17 @@
+ 		{
+ 		if (strcmp(si,"sun4v"))
+ 			/* FPU is preferred for all CPUs, but US-T1/2 */
+-			OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU;
++			OPENSSL_sparcv9cap_P[0] |= SPARCV9_PREFER_FPU;
+ 		}
+ 
+ 	if (sysinfo(SI_ISALIST,si,sizeof(si))>0)
+ 		{
+ 		if (strstr(si,"+vis"))
+-			OPENSSL_sparcv9cap_P |= SPARCV9_VIS1|SPARCV9_BLK;
++			OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS1|SPARCV9_BLK;
+ 		if (strstr(si,"+vis2"))
+ 			{
+-			OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
+-			OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
++			OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
++			OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
+ 			return;
+ 			}
+ 		}
+@@ -198,12 +240,14 @@
+  
+ 	if ((e=getenv("OPENSSL_sparcv9cap")))
+ 		{
+-		OPENSSL_sparcv9cap_P=strtoul(e,NULL,0);
++		OPENSSL_sparcv9cap_P[0]=strtoul(e,NULL,0);
++		if ((e = strchr(e, ':')))
++			OPENSSL_sparcv9cap_P[1] = strtoul(e + 1, NULL, 0);
+ 		return;
+ 		}
+ 
+ 	/* Initial value, fits UltraSPARC-I&II... */
+-	OPENSSL_sparcv9cap_P = SPARCV9_PREFER_FPU|SPARCV9_TICK_PRIVILEGED;
++	OPENSSL_sparcv9cap_P[0] = SPARCV9_PREFER_FPU|SPARCV9_TICK_PRIVILEGED;
+ 
+ 	sigfillset(&all_masked);
+ 	sigdelset(&all_masked,SIGILL);
+@@ -226,20 +270,20 @@
+ 	if (sigsetjmp(common_jmp,1) == 0)
+ 		{
+ 		_sparcv9_rdtick();
+-		OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
++		OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
+ 		}
+ 
+ 	if (sigsetjmp(common_jmp,1) == 0)
+ 		{
+ 		_sparcv9_vis1_probe();
+-		OPENSSL_sparcv9cap_P |= SPARCV9_VIS1|SPARCV9_BLK;
++		OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS1|SPARCV9_BLK;
+ 		/* detect UltraSPARC-Tx, see sparccpud.S for details... */
+ 		if (_sparcv9_vis1_instrument() >= 12)
+-			OPENSSL_sparcv9cap_P &= ~(SPARCV9_VIS1|SPARCV9_PREFER_FPU);
++			OPENSSL_sparcv9cap_P[0] &= ~(SPARCV9_VIS1|SPARCV9_PREFER_FPU);
+ 		else
+ 			{
+ 			_sparcv9_vis2_probe();
+-			OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
++			OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
+ 			}
+ 		}
+ 
+@@ -246,13 +290,49 @@
+ 	if (sigsetjmp(common_jmp,1) == 0)
+ 		{
+ 		_sparcv9_fmadd_probe();
+-		OPENSSL_sparcv9cap_P |= SPARCV9_FMADD;
++		OPENSSL_sparcv9cap_P[0] |= SPARCV9_FMADD;
+ 		}
+ 
++	/*
++	 * VIS3 flag is tested independently from VIS1, unlike VIS2 that is,
++	 * because VIS3 defines even integer instructions.
++	 */
++	if (sigsetjmp(common_jmp,1) == 0) {
++		_sparcv9_vis3_probe();
++		OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS3;
++	}
++
++	if (sigsetjmp(common_jmp,1) == 0) {
++		(void)_sparcv9_random();
++		OPENSSL_sparcv9cap_P[0] |= SPARCV9_RANDOM;
++	}
++
++	/*
++	 * In wait for better solution _sparcv9_rdcfr is masked by
++	 * VIS3 flag, because it goes to uninterruptable endless
++	 * loop on UltraSPARC II running Solaris. Things might be
++	 * different on Linux...
++	 */
++	if ((OPENSSL_sparcv9cap_P[0]&SPARCV9_VIS3) &&
++		sigsetjmp(common_jmp, 1) == 0) {
++		OPENSSL_sparcv9cap_P[1] = (unsigned int)_sparcv9_rdcfr();
++	}
++
+ 	sigaction(SIGBUS,&bus_oact,NULL);
+ 	sigaction(SIGILL,&ill_oact,NULL);
+ 
+ 	sigprocmask(SIG_SETMASK,&oset,NULL);
++
++	if (sizeof(size_t) == 8)
++		OPENSSL_sparcv9cap_P[0] |= SPARCV9_64BIT_STACK;
++#ifdef __linux
++	else {
++		int ret = syscall(340);
++
++		if (ret >= 0 && ret & 1)
++			OPENSSL_sparcv9cap_P[0] |= SPARCV9_64BIT_STACK;
+ 	}
++#endif
++	}
+ 
+ #endif
+Index: crypto/sha/Makefile
+===================================================================
+diff -ru openssl-1.0.1e/crypto/sha/Makefile openssl-1.0.1e/crypto/sha/Makefile
+--- a/crypto/sha/Makefile    2011-05-24 17:02:24.000000000 -0700
++++ b/crypto/sha/Makefile    2011-07-27 10:48:17.817470000 -0700
+@@ -66,9 +66,9 @@
+ sha1-x86_64.s:	asm/sha1-x86_64.pl;	$(PERL) asm/sha1-x86_64.pl $(PERLASM_SCHEME) > $@
+ sha256-x86_64.s:asm/sha512-x86_64.pl;	$(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@
+ sha512-x86_64.s:asm/sha512-x86_64.pl;	$(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@
+-sha1-sparcv9.s:	asm/sha1-sparcv9.pl;	$(PERL) asm/sha1-sparcv9.pl $@ $(CFLAGS)
+-sha256-sparcv9.s:asm/sha512-sparcv9.pl;	$(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
+-sha512-sparcv9.s:asm/sha512-sparcv9.pl;	$(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
++sha1-sparcv9.S:	asm/sha1-sparcv9.pl;	$(PERL) asm/sha1-sparcv9.pl $@ $(CFLAGS)
++sha256-sparcv9.S:asm/sha512-sparcv9.pl;	$(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
++sha512-sparcv9.S:asm/sha512-sparcv9.pl;	$(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
+ 
+ sha1-ppc.s:	asm/sha1-ppc.pl;	$(PERL) asm/sha1-ppc.pl $(PERLASM_SCHEME) $@
+ sha256-ppc.s:	asm/sha512-ppc.pl;	$(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
+Index: crypto/des/Makefile
+===================================================================
+diff -ru openssl-1.0.1e/crypto/des/Makefile.orig openssl-1.0.1e/crypto/des/Makefile
+--- a/crypto/des/Makefile
++++ b/crypto/des/Makefile
+@@ -61,6 +61,10 @@ des: des.o cbc3_enc.o lib
+ 
+ des_enc-sparc.S:	asm/des_enc.m4
+ 	m4 -B 8192 asm/des_enc.m4 > des_enc-sparc.S
++dest4-sparcv9.S:	asm/dest4-sparcv9.pl
++	$(PERL) asm/dest4-sparcv9.pl $(CFLAGS) > $@
++dest4-sparcv9.o:	dest4-sparcv9.S
++	$(AS) $(ASFLAGS) -Wa,-n -o $@ $^
+ 
+ des-586.s:	asm/des-586.pl ../perlasm/x86asm.pl ../perlasm/cbc.pl
+ 	$(PERL) asm/des-586.pl $(PERLASM_SCHEME) $(CFLAGS) > $@
+Index: openssl/crypto/bn/Makefile
+===================================================================
+diff -ru openssl-1.0.1e/crypto/bn/Makefile openssl-1.0.1e/crypto/bn/Makefile.new
+--- openssl-1.0.1e/crypto/bn/Makefile 2011-05-24 17:02:24.000000000 -0700
++++ openssl-1.0.1e/crypto/bn/Makefile 2011-07-27 10:48:17.817470000 -0700
+@@ -77,6 +77,16 @@
+ 	$(PERL) asm/sparcv9a-mont.pl $(CFLAGS) > $@
+ sparcv9-mont.s:		asm/sparcv9-mont.pl
+ 	$(PERL) asm/sparcv9-mont.pl $(CFLAGS) > $@
++vis3-mont.S:		asm/vis3-mont.pl
++	$(PERL) asm/vis3-mont.pl $(CFLAGS) > $@
++vis3-mont.o:	vis3-mont.S
++	$(AS) $(ASFLAGS) -Wa,-n -o $@ $^
++sparct4-mont.S:	asm/sparct4-mont.pl
++	$(PERL) asm/sparct4-mont.pl $(CFLAGS) > $@
++sparct4-mont.o:	sparct4-mont.S
++	$(CC) $(CFLAGS) -Wa,-n -c -o $@ $^
++sparcv9-gf2m.S:	asm/sparcv9-gf2m.pl
++	$(PERL) asm/sparcv9-gf2m.pl $(CFLAGS) > $@
+ 
+ bn-mips3.o:	asm/mips3.s
+ 	@if [ "$(CC)" = "gcc" ]; then \
+Index: openssl/crypto/aes/Makefile
+===================================================================
+--- Makefile	Thu May  2 13:42:37 2013
++++ Makefile.orig	Thu May  2 13:41:51 2013
+@@ -69,6 +69,11 @@
+ aes-sparcv9.s: asm/aes-sparcv9.pl
+ 	$(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@
+ 
++aest4-sparcv9.S: asm/aest4-sparcv9.pl
++	$(PERL) asm/aest4-sparcv9.pl $(CFLAGS) > $@
++aest4-sparcv9.o: aest4-sparcv9.S
++	$(AS) $(ASFLAGS) -Wa,-n -o $@ $^
++
+ aes-ppc.s:	asm/aes-ppc.pl
+ 	$(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@
+ aesp8-ppc.s:	asm/aesp8-ppc.pl
+Index: openssl/crypto/evp/evp.h
+===================================================================
+--- evp.h    Mon Feb 11 07:26:04 2013
++++ evp.h.new    Thu May  2 14:31:55 2013
+@@ -1282,6 +1282,7 @@
+ #define EVP_F_AESNI_INIT_KEY				 165
+ #define EVP_F_AESNI_XTS_CIPHER				 176
+ #define EVP_F_AES_INIT_KEY				 133
++#define EVP_F_AES_T4_INIT_KEY				 178
+ #define EVP_F_AES_XTS					 172
+ #define EVP_F_AES_XTS_CIPHER				 175
+ #define EVP_F_CAMELLIA_INIT_KEY				 159
+Index: openssl/crypto/bn/bn_exp.c
+===================================================================
+--- bn_exp.c.orig	2016-09-09 14:46:47.271555005 -0700
++++ bn_exp.c	2016-09-09 16:04:47.477700835 -0700
+@@ -124,8 +124,15 @@
+ # ifndef alloca
+ #  define alloca(s) __builtin_alloca((s))
+ # endif
++#else
++#include <alloca.h>
+ #endif
+ 
++#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc)
++# include "sparc_arch.h"
++extern unsigned int OPENSSL_sparcv9cap_P[];
++#endif
++
+ /* maximum precomputation table size for *variable* sliding windows */
+ #define TABLE_SIZE	32
+ 
+@@ -468,7 +475,15 @@
+ 	wstart=bits-1;	/* The top bit of the window */
+ 	wend=0;		/* The bottom bit of the window */
+ 
++#if 1    /* by Shay Gueron's suggestion */
++	j = mont->N.top;    /* borrow j */
++	if (bn_wexpand(r,j) == NULL) goto err;
++	r->d[0] = (0-m->d[0])&BN_MASK2;        /* 2^(top*BN_BITS2) - m */
++	for(i=1;i<j;i++) r->d[i] = (~m->d[i])&BN_MASK2;
++	r->top = j;
++#else
+ 	if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
++#endif
+ 	for (;;)
+ 		{
+ 		if (BN_is_bit_set(p,wstart) == 0)
+@@ -520,6 +535,17 @@
+ 		start=0;
+ 		if (wstart < 0) break;
+ 		}
++#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
++	if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3|SPARCV9_PREFER_FPU)) {
++		j = mont->N.top;	/* borrow j */
++		val[0]->d[0] = 1;	/* borrow val[0] */
++		for (i=1;i<j;i++)
++			val[0]->d[i] = 0;
++		val[0]->top = j;
++		if (!BN_mod_mul_montgomery(rr, r, val[0], mont, ctx))
++			goto err;
++	} else
++#endif
+ 	if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
+ 	ret=1;
+ err:
+@@ -529,7 +555,26 @@
+ 	return(ret);
+ 	}
+ 
++#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
++static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos) {
++	BN_ULONG ret = 0;
++	int wordpos;
+ 
++	wordpos = bitpos / BN_BITS2;
++	bitpos %= BN_BITS2;
++	if (wordpos>=0 && wordpos < a->top) {
++		ret = a->d[wordpos]&BN_MASK2;
++		if (bitpos) {
++			ret >>= bitpos;
++			if (++wordpos < a->top)
++				ret |= a->d[wordpos]<<(BN_BITS2-bitpos);
++		}
++	}
++
++	return ret & BN_MASK2;
++}
++#endif
++
+ /* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific layout
+  * so that accessing any of these table values shows the same access pattern as far
+  * as cache lines are concerned.  The following functions are used to transfer a BIGNUM
+@@ -588,6 +633,9 @@
+ 	int powerbufLen = 0;
+ 	unsigned char *powerbuf=NULL;
+ 	BIGNUM tmp, am;
++#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc)
++	unsigned int t4=0;
++#endif
+ 
+ 	bn_check_top(a);
+ 	bn_check_top(p);
+@@ -622,9 +670,17 @@
+ 
+ 	/* Get the window size to use with size of p. */
+ 	window = BN_window_bits_for_ctime_exponent_size(bits);
++#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc)
++	if (window>=5 && (top&15)==0 && top<=64 &&
++	    (OPENSSL_sparcv9cap_P[1]&(CFR_MONTMUL|CFR_MONTSQR))==
++	    (CFR_MONTMUL|CFR_MONTSQR) && (t4=OPENSSL_sparcv9cap_P[0]))
++		window=5;
++	else
++#endif
+ #if defined(OPENSSL_BN_ASM_MONT5)
+ 	if (window==6 && bits<=1024) window=5;	/* ~5% improvement of 2048-bit RSA sign */
+ #endif
++	(void) 0;
+ 
+ 	/* Allocate a buffer large enough to hold all of the pre-computed
+ 	 * powers of am, am itself and tmp.
+@@ -657,9 +713,9 @@
+ 	tmp.flags = am.flags = BN_FLG_STATIC_DATA;
+ 
+ 	/* prepare a^0 in Montgomery domain */
+-#if 1
++#if 0
+  	if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx))	goto err;
+-#else
++#else	/* by Shay Gueron's suggestion */
+ 	tmp.d[0] = (0-m->d[0])&BN_MASK2;	/* 2^(top*BN_BITS2) - m */
+ 	for (i=1;i<top;i++)
+ 		tmp.d[i] = (~m->d[i])&BN_MASK2;
+@@ -673,7 +729,122 @@
+ 		if (!BN_to_montgomery(&am,&am,mont,ctx))	goto err;
+ 		}
+ 	else	if (!BN_to_montgomery(&am,a,mont,ctx))		goto err;
++#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc)
++    if (t4) {
++        typedef int (*bn_pwr5_mont_f)(BN_ULONG *tp,const BN_ULONG *np,
++            const BN_ULONG *n0,const void *table,int power,int bits);
++        int bn_pwr5_mont_t4_8(BN_ULONG *tp,const BN_ULONG *np,
++            const BN_ULONG *n0,const void *table,int power,int bits);
++        int bn_pwr5_mont_t4_16(BN_ULONG *tp,const BN_ULONG *np,
++            const BN_ULONG *n0,const void *table,int power,int bits);
++        int bn_pwr5_mont_t4_24(BN_ULONG *tp,const BN_ULONG *np,
++            const BN_ULONG *n0,const void *table,int power,int bits);
++        int bn_pwr5_mont_t4_32(BN_ULONG *tp,const BN_ULONG *np,
++            const BN_ULONG *n0,const void *table,int power,int bits);
++        static const bn_pwr5_mont_f pwr5_funcs[4] = {
++            bn_pwr5_mont_t4_8,    bn_pwr5_mont_t4_16,
++            bn_pwr5_mont_t4_24,    bn_pwr5_mont_t4_32 };
++        bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top/16-1];
+ 
++        typedef int (*bn_mul_mont_f)(BN_ULONG *rp,const BN_ULONG *ap,
++            const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++        int bn_mul_mont_t4_8(BN_ULONG *rp,const BN_ULONG *ap,
++            const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++        int bn_mul_mont_t4_16(BN_ULONG *rp,const BN_ULONG *ap,
++            const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++        int bn_mul_mont_t4_24(BN_ULONG *rp,const BN_ULONG *ap,
++            const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++        int bn_mul_mont_t4_32(BN_ULONG *rp,const BN_ULONG *ap,
++            const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++        static const bn_mul_mont_f mul_funcs[4] = {
++            bn_mul_mont_t4_8,    bn_mul_mont_t4_16,
++            bn_mul_mont_t4_24,    bn_mul_mont_t4_32 };
++        bn_mul_mont_f mul_worker = mul_funcs[top/16-1];
++
++        void bn_mul_mont_vis3(BN_ULONG *rp,const BN_ULONG *ap,
++            const void *bp,const BN_ULONG *np,
++            const BN_ULONG *n0,int num);
++        void bn_mul_mont_t4(BN_ULONG *rp,const BN_ULONG *ap,
++            const void *bp,const BN_ULONG *np,
++            const BN_ULONG *n0,int num);
++        void bn_mul_mont_gather5_t4(BN_ULONG *rp,const BN_ULONG *ap,
++            const void *table,const BN_ULONG *np,
++            const BN_ULONG *n0,int num,int power);
++        void bn_flip_n_scatter5_t4(const BN_ULONG *inp,size_t num,
++            void *table,size_t power);
++        void bn_gather5_t4(BN_ULONG *out,size_t num,
++            void *table,size_t power);
++        void bn_flip_t4(BN_ULONG *dst,BN_ULONG *src,size_t num);
++
++        BN_ULONG *np=mont->N.d, *n0=mont->n0;
++        int stride = 5*(6-(top/16-1));    /* multiple of 5, but less than 32 */
++
++        /*
++         * BN_to_montgomery can contaminate words above .top
++         * [in BN_DEBUG[_DEBUG] build]...
++         */
++        for (i=am.top; i<top; i++)    am.d[i]=0;
++        for (i=tmp.top; i<top; i++)    tmp.d[i]=0;
++
++        bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,0);
++        bn_flip_n_scatter5_t4(am.d,top,powerbuf,1);
++        if (!(*mul_worker)(tmp.d,am.d,am.d,np,n0) &&
++        !(*mul_worker)(tmp.d,am.d,am.d,np,n0))
++        bn_mul_mont_vis3(tmp.d,am.d,am.d,np,n0,top);
++        bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,2);
++
++        for (i=3; i<32; i++) {
++        /* Calculate a^i = a^(i-1) * a */
++        if (!(*mul_worker)(tmp.d,tmp.d,am.d,np,n0) &&
++            !(*mul_worker)(tmp.d,tmp.d,am.d,np,n0))
++            bn_mul_mont_vis3(tmp.d,tmp.d,am.d,np,n0,top);
++        bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,i);
++        }
++
++        /* switch to 64-bit domain */
++        np = alloca(top*sizeof(BN_ULONG));
++        top /= 2;
++        bn_flip_t4(np,mont->N.d,top);
++
++        bits--;
++        for (wvalue=0, i=bits%5; i>=0; i--,bits--)
++        wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
++        bn_gather5_t4(tmp.d,top,powerbuf,wvalue);
++
++        /* Scan the exponent one window at a time starting from the most
++         * significant bits.
++         */
++        while (bits >= 0) {
++        if (bits < stride)
++            stride = bits+1;
++        bits -= stride;
++        wvalue = (bn_get_bits(p,bits+1));
++
++        if ((*pwr5_worker)(tmp.d,np,n0,powerbuf,wvalue,stride))
++            continue;
++        /* retry once and fall back */
++        if ((*pwr5_worker)(tmp.d,np,n0,powerbuf,wvalue,stride))
++            continue;
++
++        bits += stride-5;
++        wvalue >>= stride-5;
++        wvalue &= 31;
++        bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++        bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++        bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++        bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++        bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++        bn_mul_mont_gather5_t4(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
++        }
++
++        bn_flip_t4(tmp.d,tmp.d,top);
++        top *= 2;
++        /* back to 32-bit domain */
++        tmp.top=top;
++        bn_correct_top(&tmp);
++        OPENSSL_cleanse(np,top*sizeof(BN_ULONG));
++    } else
++#endif
+ #if defined(OPENSSL_BN_ASM_MONT5)
+     /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
+      * specifically optimization of cache-timing attack countermeasures
+@@ -812,6 +983,15 @@
+ 	}
+ 
+  	/* Convert the final result from montgomery to standard format */
++#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
++	if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3|SPARCV9_PREFER_FPU)) {
++		am.d[0] = 1;	/* borrow am */
++		for (i = 1; i < top; i++)
++			am.d[i] = 0;
++		if (!BN_mod_mul_montgomery(rr,&tmp,&am,mont,ctx))
++			goto err;
++	} else
++#endif
+ 	if (!BN_from_montgomery(rr,&tmp,mont,ctx)) goto err;
+ 	ret=1;
+ err:
+Index: fips/fipssyms.h
+===================================================================
+--- a/fips/fipssyms.h	2016-10-14 11:12:58.496245385 -0700
++++ b/fips/fipssyms.h	2016-10-14 11:12:49.159899380 -0700
+@@ -476,19 +476,56 @@
+ #define SHA512_Update fips_sha512_update
+ #define SHA512_version fips_sha512_version
+ #define _shadow_DES_check_key fips__shadow_des_check_key
++#define aes_t4_decrypt fips_aes_t4_decrypt
++#define aes_t4_encrypt fips_aes_t4_encrypt
++#define aes_t4_set_decrypt_key fips_aes_t4_set_decrypt_key
++#define aes_t4_set_encrypt_key fips_aes_t4_set_encrypt_key
++#define aes128_t4_cbc_decrypt fips_aes128_t4_cbc_decrypt
++#define aes128_t4_cbc_encrypt fips_aes128_t4_cbc_encrypt
++#define aes128_t4_ctr32_encrypt fips_aes128_t4_ctr32_encrypt
++#define aes128_t4_xts_decrypt fips_aes128_t4_xts_decrypt
++#define aes128_t4_xts_encrypt fips_aes128_t4_xts_encrypt
++#define aes192_t4_cbc_decrypt fips_aes192_t4_cbc_decrypt
++#define aes192_t4_cbc_encrypt fips_aes192_t4_cbc_encrypt
++#define aes192_t4_ctr32_encrypt fips_aes192_t4_ctr32_encrypt
++#define aes256_t4_cbc_decrypt fips_aes256_t4_cbc_decrypt
++#define aes256_t4_cbc_encrypt fips_aes256_t4_cbc_encrypt
++#define aes256_t4_ctr32_encrypt fips_aes256_t4_ctr32_encrypt
++#define aes256_t4_xts_decrypt fips_aes256_t4_xts_decrypt
++#define aes256_t4_xts_encrypt fips_aes256_t4_xts_encrypt
++#define bn_GF2m_mul_2x2 fips_bn_GF2m_mul_2x2
+ #define bn_add_part_words fips_bn_add_part_words
+ #define bn_cmp_part_words fips_bn_cmp_part_words
+ #define bn_cmp_words fips_bn_cmp_words
+ #define bn_dup_expand fips_bn_dup_expand
+ #define bn_expand2 fips_bn_expand2
++#define bn_flip_n_scatter5_t4 fips_bn_flip_n_scatter5_t4
++#define bn_flip_t4 fips_bn_flip_t4
++#define bn_gather5_t4 fips_bn_gather5_t4
+ #define bn_mul_high fips_bn_mul_high
+ #define bn_mul_low_normal fips_bn_mul_low_normal
+ #define bn_mul_low_recursive fips_bn_mul_low_recursive
++#define bn_mul_mont_gather5_t4 fips_bn_mul_mont_gather5_t4
++#define bn_mul_mont_t4 fips_bn_mul_mont_t4
++#define bn_mul_mont_t4_8 fips_bn_mul_mont_t4_8
++#define bn_mul_mont_t4_16 fips_bn_mul_mont_t4_16
++#define bn_mul_mont_t4_24 fips_bn_mul_mont_t4_24
++#define bn_mul_mont_t4_32 fips_bn_mul_mont_t4_32
++#define bn_mul_mont_vis3 fips_bn_mul_mont_vis3
+ #define bn_mul_normal fips_bn_mul_normal
+ #define bn_mul_part_recursive fips_bn_mul_part_recursive
+ #define bn_mul_recursive fips_bn_mul_recursive
++#define bn_pwr5_mont_t4_8 fips_bn_pwr5_mont_t4_8
++#define bn_pwr5_mont_t4_16 fips_bn_pwr5_mont_t4_16
++#define bn_pwr5_mont_t4_24 fips_bn_pwr5_mont_t4_24
++#define bn_pwr5_mont_t4_32 fips_bn_pwr5_mont_t4_32
+ #define bn_sqr_normal fips_bn_sqr_normal
+ #define bn_sqr_recursive fips_bn_sqr_recursive
++#define des_t4_cbc_decrypt fips_des_t4_cbc_decrypt
++#define des_t4_cbc_encrypt fips_des_t4_cbc_encrypt
++#define des_t4_ede3_cbc_decrypt fips_des_t4_ede3_cbc_decrypt
++#define des_t4_ede3_cbc_encrypt fips_des_t4_ede3_cbc_encrypt
++#define des_t4_key_expand fips_des_t4_key_expand
+ #define dsa_builtin_paramgen fips_dsa_builtin_paramgen
+ #define dsa_builtin_paramgen2 fips_dsa_builtin_paramgen2
+ #define dsa_paramgen_check_g fips_dsa_paramgen_check_g
+Index: fips/sha/fips_standalone_sha1.c
+===================================================================
+--- a/fips/sha/fips_standalone_sha1.c	2016-06-20 12:49:46.000000000 -0700
++++ b/fips/sha/fips_standalone_sha1.c	2016-10-25 09:26:32.105775365 -0700
+@@ -60,6 +60,7 @@
+ void FIPS_selftest_check() {}
+ void OPENSSL_cleanse(void *p,size_t len) {}
+ unsigned int  OPENSSL_ia32cap_P[2];
++unsigned int  OPENSSL_sparcv9cap_P[2];
+ #endif
+ 
+ #ifdef OPENSSL_FIPS