15824599 SUNBT7206151 T4 hash should be embedded in the OpenSSL upstream src
author Misaki Miyashita <Misaki.Miyashita@Oracle.COM>
Mon, 15 Apr 2013 09:23:35 -0700
changeset 1267 3d7359ef8168
parent 1266 1a59fbe869c6
child 1268 3c5ed0830c8e
15824599 SUNBT7206151 T4 hash should be embedded in the OpenSSL upstream src
components/openssl/README
components/openssl/openssl-1.0.1/Makefile
components/openssl/openssl-1.0.1/engines/pkcs11/hw_pk11.c
components/openssl/openssl-1.0.1/engines/t4/eng_t4.c
components/openssl/openssl-1.0.1/inline-t4/md5-sparcv9.pl
components/openssl/openssl-1.0.1/inline-t4/sparc_arch.h
components/openssl/openssl-1.0.1/patches/openssl-1.0.1e-t4-engine.sparc-patch
components/openssl/openssl-1.0.1/patches/openssl-t4-inline.sparc-patch
--- a/components/openssl/README	Mon Apr 15 09:10:33 2013 -0700
+++ b/components/openssl/README	Mon Apr 15 09:23:35 2013 -0700
@@ -18,9 +18,10 @@
 #
 # CDDL HEADER END
 #
-# Copyright (c) 2009, 2012, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
 #
 
+
 Build Layout
 ---
 
@@ -33,6 +34,24 @@
 
 See also comments in all the Makefiles for more information.
 
+OpenSSL Version
+---
+
+For the non-FIPS build, we currently deliver OpenSSL 1.0.1e with some updates
+taken from OpenSSL 1.0.2 so that T4 instructions are embedded in the OpenSSL
+upstream code.  As of April 2013, 1.0.2 has not yet been released, and
+therefore we have decided to patch the code.
+The following files/code are copied in from 1.0.2.
+added:
+   components/openssl/openssl-1.0.1/inline-t4/md5-sparcv9.pl
+   components/openssl/openssl-1.0.1/inline-t4/sparc_arch.h
+   components/openssl/openssl-1.0.1/patches/openssl-t4-inline.sparc-patch
+TPNO for OpenSSL 1.0.1e is 13003.
+
+For FIPS build, we currently deliver OpenSSL 0.9.8y with OpenSSL FIPS module 2.1.
+TPNO for OpenSSL 0.9.8y is 13019.
+
+
 The non-fips Build.
 ---
 
@@ -98,19 +117,19 @@
     - should be ok - original detection seems broken, FPU gets never used
 - implementation of atoi()
 
-
-openssl-1.0.0d-aesni-v4.i386-patch
-X86-only patch.
-Add a built-in engine, aesni, to support X86 AES-NI instructions, along with
-files engines/aesni/aesni-x86[_64].pl.
-This patch is for OpenSSL 1.0.0d.  For newer OpenSSL versions, a newer patch
-may be needed.
+31_dtls_version.patch
+Fixes the DTLS_BAD_VER bug reported after OpenSSL 1.0.1e was released.
 
 openssl-1.0.0d-t4-engine.sparc-patch
 SPARC-only patch.
 Add a built-in engine, t4, to support SPARC T4 crypto instructions.
 along with files in directory engines/t4.
 
+openssl-t4-inline.sparc-patch
+SPARC-only patch.
+Adds support for inline T4 instructions in the OpenSSL upstream code until
+OpenSSL 1.0.2 is released.
+
 opensslconf.patch
 Modifies opensslconf.h so that it is suitable for both 32bit and 64bit installs.
 OpenSSL either builds for 32bit or 64bit - it doesn't allow for combined 32bit
--- a/components/openssl/openssl-1.0.1/Makefile	Mon Apr 15 09:10:33 2013 -0700
+++ b/components/openssl/openssl-1.0.1/Makefile	Mon Apr 15 09:23:35 2013 -0700
@@ -41,7 +41,8 @@
 COMPONENT_BUGDB=	utility/openssl
 
 # Architecture-specific patches
-EXTRA_PATCHES.sparc = $(PATCH_DIR)/openssl-1.0.1e-t4-engine.sparc-patch
+EXTRA_PATCHES.sparc = $(PATCH_DIR)/openssl-t4-inline.sparc-patch
+EXTRA_PATCHES.sparc += $(PATCH_DIR)/openssl-1.0.1e-t4-engine.sparc-patch
 EXTRA_PATCHES = $(EXTRA_PATCHES.$(MACH))
 
 include $(WS_TOP)/make-rules/prep.mk
@@ -179,7 +180,9 @@
       $(LN) -fs $(COMPONENT_DIR)/engines/t4/t4_des.S		$(@D)/crypto/des/asm; \
       $(LN) -fs $(COMPONENT_DIR)/engines/t4/t4_md5.S		$(@D)/crypto/md5/asm; \
       $(LN) -fs $(COMPONENT_DIR)/engines/t4/t4_sha?.S		$(@D)/crypto/sha/asm; \
-      $(LN) -fs $(COMPONENT_DIR)/wanboot-openssl/wanboot-stubs.c	$(@D)/crypto; )
+      $(LN) -fs $(COMPONENT_DIR)/wanboot-openssl/wanboot-stubs.c	$(@D)/crypto; \
+      $(LN) -fs $(COMPONENT_DIR)/inline-t4/sparc_arch.h		$(@D)/crypto/; \
+      $(LN) -fs $(COMPONENT_DIR)/inline-t4/md5-sparcv9.pl		$(@D)/crypto/md5/asm; )
 
 # OpenSSL for wanboot is built on sparc only.
 ifeq ($(MACH), sparc)
--- a/components/openssl/openssl-1.0.1/engines/pkcs11/hw_pk11.c	Mon Apr 15 09:10:33 2013 -0700
+++ b/components/openssl/openssl-1.0.1/engines/pkcs11/hw_pk11.c	Mon Apr 15 09:23:35 2013 -0700
@@ -332,6 +332,9 @@
 static int check_hw_mechanisms(void);
 static int nid_in_table(int nid, int *nid_table);
 static int hw_aes_instruction_set_present(void);
+#if	defined(__sparc)
+static int hw_yf_digest_instruction_present(void);
+#endif
 #endif	/* SOLARIS_HW_SLOT_SELECTION */
 
 #define	TRY_OBJ_DESTROY(sp, obj_hdl, retval, uselock, alg_type)	\
@@ -911,13 +914,23 @@
 
 	if (!ENGINE_set_id(e, engine_pk11_id) ||
 	    !ENGINE_set_name(e, engine_pk11_name) ||
-	    !ENGINE_set_ciphers(e, pk11_engine_ciphers) ||
-	    !ENGINE_set_digests(e, pk11_engine_digests))
+	    !ENGINE_set_ciphers(e, pk11_engine_ciphers))
 		return (0);
 
 	if (!ENGINE_set_pkey_meths(e, pk11_engine_pkey_methods))
 		return (0);
 
+#if	defined(__sparc)
+	/*
+	 * Enable hash mechanisms for pkcs11 engine only if T4 digest
+	 * instruction is not present.
+	 */
+	if (!hw_yf_digest_instruction_present())
+#endif	/* defined(__sparc) */
+		if (!ENGINE_set_digests(e, pk11_engine_digests)) {
+			return (0);
+		}
+
 #ifndef OPENSSL_NO_RSA
 	if (pk11_have_rsa == CK_TRUE)
 		{
@@ -3797,6 +3810,24 @@
 	return (present);
 	}
 
+#if	defined(__sparc)
+static int
+hw_yf_digest_instruction_present(void)
+{
+	static int cached_result = -1;
+	uint_t ui = 0;
+
+	if (cached_result == -1) {
+		(void) getisax(&ui, 1);
+		cached_result = ((ui & AV_SPARC_MD5) != 0) &&
+		    ((ui & AV_SPARC_SHA1) != 0) &&
+		    ((ui & AV_SPARC_SHA256) != 0) &&
+		    ((ui & AV_SPARC_SHA512) != 0);
+	}
+	return (cached_result != 0);
+}
+#endif	/* defined(__sparc) */
+
 #endif	/* SOLARIS_HW_SLOT_SELECTION */
 
 #endif	/* OPENSSL_NO_HW_PK11 */
--- a/components/openssl/openssl-1.0.1/engines/t4/eng_t4.c	Mon Apr 15 09:10:33 2013 -0700
+++ b/components/openssl/openssl-1.0.1/engines/t4/eng_t4.c	Mon Apr 15 09:23:35 2013 -0700
@@ -168,13 +168,9 @@
 static t4_cipher_id get_cipher_index_by_nid(int nid);
 #pragma inline(get_cipher_index_by_nid)
 static void t4_instructions_present(_Bool *aes_present, _Bool *des_present,
-    _Bool *digest_present, _Bool *montmul_present);
+    _Bool *montmul_present);
 #pragma inline(t4_instructions_present)
 
-/* Digest registration function. Called by ENGINE_set_ciphers() */
-int t4_get_all_digests(ENGINE *e, const EVP_MD **digest,
-    const int **nids, int nid);
-
 /* RSA_METHOD structure used by ENGINE_set_RSA() */
 extern RSA_METHOD *t4_RSA(void);
 
@@ -402,170 +398,31 @@
 
 
 /*
- * Message Digest variables
- */
-static const int t4_digest_nids[] = {
-#ifndef	OPENSSL_NO_MD5
-	NID_md5,
-#endif
-#ifndef	OPENSSL_NO_SHA
-#ifndef	OPENSSL_NO_SHA1
-	NID_sha1,
-#endif
-#ifndef	OPENSSL_NO_SHA256
-	NID_sha224,
-	NID_sha256,
-#endif
-#ifndef	OPENSSL_NO_SHA512
-	NID_sha384,
-	NID_sha512,
-#endif
-#endif	/* !OPENSSL_NO_SHA */
-};
-static const int t4_digest_count =
-	(sizeof (t4_digest_nids) / sizeof (t4_digest_nids[0]));
-
-#ifndef	OPENSSL_NO_MD5
-extern const EVP_MD t4_md5;
-#endif
-#ifndef	OPENSSL_NO_SHA
-#ifndef	OPENSSL_NO_SHA1
-extern const EVP_MD t4_sha1;
-#endif
-#ifndef	OPENSSL_NO_SHA256
-extern const EVP_MD t4_sha224;
-extern const EVP_MD t4_sha256;
-#endif
-#ifndef	OPENSSL_NO_SHA512
-extern const EVP_MD t4_sha384;
-extern const EVP_MD t4_sha512;
-#endif
-#endif	/* !OPENSSL_NO_SHA */
-
-/*
- * Message Digest functions
- */
-
-/*
- * Registered by the ENGINE with ENGINE_set_digests().
- * Finds out how to deal with a particular digest NID in the ENGINE.
- */
-/* ARGSUSED */
-int
-t4_get_all_digests(ENGINE *e, const EVP_MD **digest,
-    const int **nids, int nid)
-{
-	if (digest == NULL) { /* return a list of all supported digests */
-		*nids = (t4_digest_count > 0) ? t4_digest_nids : NULL;
-		return (t4_digest_count);
-	}
-
-	switch (nid) {
-#ifndef	OPENSSL_NO_MD5
-	case NID_md5:
-		*digest = &t4_md5;
-		break;
-#endif
-#ifndef	OPENSSL_NO_SHA
-#ifndef	OPENSSL_NO_SHA1
-	/*
-	 * A special case. For "openssl dgst -dss1 ...",
-	 * OpenSSL calls EVP_get_digestbyname() on "dss1" which ends up
-	 * calling t4_get_all_digests() for NID_dsa. Internally, if an
-	 * engine is not used, OpenSSL uses SHA1_Init() as expected for
-	 * DSA. So, we must return t4_sha1 for NID_dsa as well. Note
-	 * that this must have changed between 0.9.8 and 1.0.0 since we
-	 * did not have the problem with the 0.9.8 version.
-	 */
-	case NID_dsa:
-	case NID_sha1:
-		*digest = &t4_sha1;
-		break;
-#endif
-#ifndef	OPENSSL_NO_SHA256
-	case NID_sha224:
-		*digest = &t4_sha224;
-		break;
-	case NID_sha256:
-		*digest = &t4_sha256;
-		break;
-#endif
-#ifndef	OPENSSL_NO_SHA512
-	case NID_sha384:
-		*digest = &t4_sha384;
-		break;
-	case NID_sha512:
-		*digest = &t4_sha512;
-		break;
-#endif
-#endif	/* !OPENSSL_NO_SHA */
-	default:
-		/* digest not supported */
-		*digest = NULL;
-		return (0);
-	}
-
-	return (1);
-}
-
-
-/*
  * Utility Functions
  */
 
 /*
- * Set aes_present, des_present, digest_present and montmul_present
- * to B_FALSE or B_TRUE depending on
- * whether the current SPARC processor supports AES, DES,
- * MD5/SHA1/SHA256/SHA512 and MONTMUL, respectively.
+ * Set aes_present, des_present and montmul_present to B_FALSE or B_TRUE
+ * depending on whether the current SPARC processor supports AES, DES
+ * and MONTMUL, respectively.
  */
 static void
 t4_instructions_present(_Bool *aes_present, _Bool *des_present,
-    _Bool *digest_present, _Bool *montmul_present)
+    _Bool *montmul_present)
 {
 #ifdef	OPENSSL_NO_DES
 #undef	AV_SPARC_DES
 #define	AV_SPARC_DES	0
 #endif
-#ifdef	OPENSSL_NO_MD5
-#undef	AV_SPARC_MD5
-#define	AV_SPARC_MD5	0
-#endif
-#ifndef	OPENSSL_NO_SHA
-#ifdef	OPENSSL_NO_SHA1
-#undef	AV_SPARC_SHA1
-#define	AV_SPARC_SHA1	0
-#endif
-#ifdef	OPENSSL_NO_SHA256
-#undef	AV_SPARC_SHA256
-#define	AV_SPARC_SHA256	0
-#endif
-#ifdef	OPENSSL_NO_SHA512
-#undef	AV_SPARC_SHA512
-#define	AV_SPARC_SHA512	0
-#endif
-#else
-#undef	AV_SPARC_SHA1
-#undef	AV_SPARC_SHA256
-#undef	AV_SPARC_SHA512
-#define	AV_SPARC_SHA1	0
-#define	AV_SPARC_SHA256	0
-#define	AV_SPARC_SHA512	0
-#endif	/* !OPENSSL_NO_SHA */
-
-#define	DIGEST_MASK	(AV_SPARC_MD5 | AV_SPARC_SHA1 | AV_SPARC_SHA256 | \
-	AV_SPARC_SHA512)
 	uint_t		ui;
 
 	(void) getisax(&ui, 1);
 	*aes_present = ((ui & AV_SPARC_AES) != 0);
 	*des_present = ((ui & AV_SPARC_DES) != 0);
-	*digest_present = ((ui & DIGEST_MASK) == DIGEST_MASK);
 	*montmul_present = ((ui & AV_SPARC_MONT) != 0);
 }
 
 
-
 /*
  * Cipher functions
  */
@@ -933,14 +790,12 @@
 static int
 t4_bind(ENGINE *e)
 {
-	_Bool aes_engage, digest_engage, des_engage, montmul_engage;
+	_Bool aes_engage, des_engage, montmul_engage;
 
-	t4_instructions_present(&aes_engage, &des_engage, &digest_engage,
-	    &montmul_engage);
+	t4_instructions_present(&aes_engage, &des_engage, &montmul_engage);
 #ifdef	DEBUG_T4
 	(void) fprintf(stderr,
-	    "t4_bind: engage aes=%d, des=%d, digest=%d\n",
-	    aes_engage, des_engage, digest_engage);
+	    "t4_bind: engage aes=%d, des=%d\n", aes_engage, des_engage);
 #endif
 #ifndef	OPENSSL_NO_DES
 	if (!des_engage) { /* Remove DES ciphers from list */
@@ -963,7 +818,6 @@
 	    aes_engage ? ENGINE_T4_NAME: ENGINE_NO_T4_NAME) ||
 	    !ENGINE_set_init_function(e, t4_init) ||
 	    (aes_engage && !ENGINE_set_ciphers(e, t4_get_all_ciphers)) ||
-	    (digest_engage && !ENGINE_set_digests(e, t4_get_all_digests)) ||
 #ifndef OPENSSL_NO_RSA
 	    (montmul_engage && !ENGINE_set_RSA(e, t4_RSA())) ||
 #endif	/* OPENSSL_NO_RSA */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-1.0.1/inline-t4/md5-sparcv9.pl	Mon Apr 15 09:23:35 2013 -0700
@@ -0,0 +1,434 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
+# ====================================================================
+
+# MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
+# code generated by Sun C 5.2.
+
+# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
+# faster than software. Multi-process benchmark saturates at 12x
+# single-process result on 8-core processor, or ~11GBps per 2.85GHz
+# socket.
+
+$bits=32;
+for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64)	{ $bias=2047; $frame=192; }
+else		{ $bias=0;    $frame=112; }
+
+$output=shift;
+open STDOUT,">$output";
+
+use integer;
+
+($ctx,$inp,$len)=("%i0","%i1","%i2");	# input arguments
+
+# 64-bit values
+@X=("%o0","%o1","%o2","%o3","%o4","%o5","%o7","%g1","%g2");
+$tx="%g3";
+($AB,$CD)=("%g4","%g5");
+
+# 32-bit values
+@V=($A,$B,$C,$D)=map("%l$_",(0..3));
+($t1,$t2,$t3,$saved_asi)=map("%l$_",(4..7));
+($shr,$shl1,$shl2)=("%i3","%i4","%i5");
+
+my @K=(	0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
+	0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
+	0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
+	0x6b901122,0xfd987193,0xa679438e,0x49b40821,
+
+	0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
+	0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
+	0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
+	0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
+
+	0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
+	0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
+	0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
+	0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
+
+	0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
+	0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
+	0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
+	0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391, 0	);
+
+sub R0 {
+  my ($i,$a,$b,$c,$d) = @_;
+  my $rot = (7,12,17,22)[$i%4];
+  my $j   = ($i+1)/2;
+
+  if ($i&1) {
+    $code.=<<___;
+	 srlx	@X[$j],$shr,@X[$j]	! align X[`$i+1`]
+	and	$b,$t1,$t1		! round $i
+	 sllx	@X[$j+1],$shl1,$tx
+	add	$t2,$a,$a
+	 sllx	$tx,$shl2,$tx
+	xor	$d,$t1,$t1
+	 or	$tx,@X[$j],@X[$j]
+	 sethi	%hi(@K[$i+1]),$t2
+	add	$t1,$a,$a
+	 or	$t2,%lo(@K[$i+1]),$t2
+	sll	$a,$rot,$t3
+	 add	@X[$j],$t2,$t2		! X[`$i+1`]+K[`$i+1`]
+	srl	$a,32-$rot,$a
+	add	$b,$t3,$t3
+	 xor	 $b,$c,$t1
+	add	$t3,$a,$a
+___
+  } else {
+    $code.=<<___;
+	 srlx	@X[$j],32,$tx		! extract X[`2*$j+1`]
+	and	$b,$t1,$t1		! round $i
+	add	$t2,$a,$a
+	xor	$d,$t1,$t1
+	 sethi	%hi(@K[$i+1]),$t2
+	add	$t1,$a,$a
+	 or	$t2,%lo(@K[$i+1]),$t2
+	sll	$a,$rot,$t3
+	 add	$tx,$t2,$t2		! X[`2*$j+1`]+K[`$i+1`]
+	srl	$a,32-$rot,$a
+	add	$b,$t3,$t3
+	 xor	 $b,$c,$t1
+	add	$t3,$a,$a
+___
+  }
+}
+
+sub R0_1 {
+  my ($i,$a,$b,$c,$d) = @_;
+  my $rot = (7,12,17,22)[$i%4];
+
+$code.=<<___;
+	 srlx	@X[0],32,$tx		! extract X[1]
+	and	$b,$t1,$t1		! round $i
+	add	$t2,$a,$a
+	xor	$d,$t1,$t1
+	 sethi	%hi(@K[$i+1]),$t2
+	add	$t1,$a,$a
+	 or	$t2,%lo(@K[$i+1]),$t2
+	sll	$a,$rot,$t3
+	 add	$tx,$t2,$t2		! X[1]+K[`$i+1`]
+	srl	$a,32-$rot,$a
+	add	$b,$t3,$t3
+	 andn	 $b,$c,$t1
+	add	$t3,$a,$a
+___
+}
+
+sub R1 {
+  my ($i,$a,$b,$c,$d) = @_;
+  my $rot = (5,9,14,20)[$i%4];
+  my $j   = $i<31 ? (1+5*($i+1))%16 : (5+3*($i+1))%16;
+  my $xi  = @X[$j/2];
+
+$code.=<<___ if ($j&1 && ($xi=$tx));
+	 srlx	@X[$j/2],32,$xi		! extract X[$j]
+___
+$code.=<<___;
+	and	$b,$d,$t3		! round $i
+	add	$t2,$a,$a
+	or	$t3,$t1,$t1
+	 sethi	%hi(@K[$i+1]),$t2
+	add	$t1,$a,$a
+	 or	$t2,%lo(@K[$i+1]),$t2
+	sll	$a,$rot,$t3
+	 add	$xi,$t2,$t2		! X[$j]+K[`$i+1`]
+	srl	$a,32-$rot,$a
+	add	$b,$t3,$t3
+	 `$i<31?"andn":"xor"`	 $b,$c,$t1
+	add	$t3,$a,$a
+___
+}
+
+sub R2 {
+  my ($i,$a,$b,$c,$d) = @_;
+  my $rot = (4,11,16,23)[$i%4];
+  my $j   = $i<47 ? (5+3*($i+1))%16 : (0+7*($i+1))%16;
+  my $xi  = @X[$j/2];
+
+$code.=<<___ if ($j&1 && ($xi=$tx));
+	 srlx	@X[$j/2],32,$xi		! extract X[$j]
+___
+$code.=<<___;
+	add	$t2,$a,$a		! round $i
+	xor	$b,$t1,$t1
+	 sethi	%hi(@K[$i+1]),$t2
+	add	$t1,$a,$a
+	 or	$t2,%lo(@K[$i+1]),$t2
+	sll	$a,$rot,$t3
+	 add	$xi,$t2,$t2		! X[$j]+K[`$i+1`]
+	srl	$a,32-$rot,$a
+	add	$b,$t3,$t3
+	 xor	 $b,$c,$t1
+	add	$t3,$a,$a
+___
+}
+
+sub R3 {
+  my ($i,$a,$b,$c,$d) = @_;
+  my $rot = (6,10,15,21)[$i%4];
+  my $j   = (0+7*($i+1))%16;
+  my $xi  = @X[$j/2];
+
+$code.=<<___;
+	add	$t2,$a,$a		! round $i
+___
+$code.=<<___ if ($j&1 && ($xi=$tx));
+	 srlx	@X[$j/2],32,$xi		! extract X[$j]
+___
+$code.=<<___;
+	orn	$b,$d,$t1
+	 sethi	%hi(@K[$i+1]),$t2
+	xor	$c,$t1,$t1
+	 or	$t2,%lo(@K[$i+1]),$t2
+	add	$t1,$a,$a
+	sll	$a,$rot,$t3
+	 add	$xi,$t2,$t2		! X[$j]+K[`$i+1`]
+	srl	$a,32-$rot,$a
+	add	$b,$t3,$t3
+	add	$t3,$a,$a
+___
+}
+
+$code.=<<___ if ($bits==64);
+.register	%g2,#scratch
+.register	%g3,#scratch
+___
+$code.=<<___;
+#include "sparc_arch.h"
+
+.section	".text",#alloc,#execinstr
+
+#ifdef __PIC__
+SPARC_PIC_THUNK(%g1)
+#endif
+
+.globl	md5_block_asm_data_order
+.align	32
+md5_block_asm_data_order:
+	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
+	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]
+
+	andcc	%g1, CFR_MD5, %g0
+	be	.Lsoftware
+	nop
+
+	mov	4, %g1
+	andcc	%o1, 0x7, %g0
+	lda	[%o0 + %g0]0x88, %f0		! load context
+	lda	[%o0 + %g1]0x88, %f1
+	add	%o0, 8, %o0
+	lda	[%o0 + %g0]0x88, %f2
+	lda	[%o0 + %g1]0x88, %f3
+	bne,pn	%icc, .Lhwunaligned
+	sub	%o0, 8, %o0
+
+.Lhw_loop:
+	ldd	[%o1 + 0x00], %f8
+	ldd	[%o1 + 0x08], %f10
+	ldd	[%o1 + 0x10], %f12
+	ldd	[%o1 + 0x18], %f14
+	ldd	[%o1 + 0x20], %f16
+	ldd	[%o1 + 0x28], %f18
+	ldd	[%o1 + 0x30], %f20
+	subcc	%o2, 1, %o2		! done yet? 
+	ldd	[%o1 + 0x38], %f22
+	add	%o1, 0x40, %o1
+	prefetch [%o1 + 63], 20
+
+	.word	0x81b02800		! MD5
+
+	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhw_loop
+	nop
+
+.Lhwfinish:
+	sta	%f0, [%o0 + %g0]0x88	! store context
+	sta	%f1, [%o0 + %g1]0x88
+	add	%o0, 8, %o0
+	sta	%f2, [%o0 + %g0]0x88
+	sta	%f3, [%o0 + %g1]0x88
+	retl
+	nop
+
+.align	8
+.Lhwunaligned:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f10
+.Lhwunaligned_loop:
+	ldd	[%o1 + 0x08], %f12
+	ldd	[%o1 + 0x10], %f14
+	ldd	[%o1 + 0x18], %f16
+	ldd	[%o1 + 0x20], %f18
+	ldd	[%o1 + 0x28], %f20
+	ldd	[%o1 + 0x30], %f22
+	ldd	[%o1 + 0x38], %f24
+	subcc	%o2, 1, %o2		! done yet?
+	ldd	[%o1 + 0x40], %f26
+	add	%o1, 0x40, %o1
+	prefetch [%o1 + 63], 20
+
+	faligndata %f10, %f12, %f8
+	faligndata %f12, %f14, %f10
+	faligndata %f14, %f16, %f12
+	faligndata %f16, %f18, %f14
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+
+	.word	0x81b02800		! MD5
+
+	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
+	for	%f26, %f26, %f10	! %f10=%f26
+
+	ba	.Lhwfinish
+	nop
+
+.align	16
+.Lsoftware:
+	save	%sp,-$frame,%sp
+
+	rd	%asi,$saved_asi
+	wr	%g0,0x88,%asi		! ASI_PRIMARY_LITTLE
+	and	$inp,7,$shr
+	andn	$inp,7,$inp
+
+	sll	$shr,3,$shr		! *=8
+	mov	56,$shl2
+	ld	[$ctx+0],$A
+	sub	$shl2,$shr,$shl2
+	ld	[$ctx+4],$B
+	and	$shl2,32,$shl1
+	add	$shl2,8,$shl2
+	ld	[$ctx+8],$C
+	sub	$shl2,$shl1,$shl2	! shr+shl1+shl2==64
+	ld	[$ctx+12],$D
+	nop
+
+.Loop:
+	 cmp	$shr,0			! was inp aligned?
+	ldxa	[$inp+0]%asi,@X[0]	! load little-endian input
+	ldxa	[$inp+8]%asi,@X[1]
+	ldxa	[$inp+16]%asi,@X[2]
+	ldxa	[$inp+24]%asi,@X[3]
+	ldxa	[$inp+32]%asi,@X[4]
+	 sllx	$A,32,$AB		! pack A,B
+	ldxa	[$inp+40]%asi,@X[5]
+	 sllx	$C,32,$CD		! pack C,D
+	ldxa	[$inp+48]%asi,@X[6]
+	 or	$B,$AB,$AB
+	ldxa	[$inp+56]%asi,@X[7]
+	 or	$D,$CD,$CD
+	bnz,a,pn	%icc,.+8
+	ldxa	[$inp+64]%asi,@X[8]
+
+	srlx	@X[0],$shr,@X[0]	! align X[0]
+	sllx	@X[1],$shl1,$tx
+	 sethi	%hi(@K[0]),$t2
+	sllx	$tx,$shl2,$tx
+	 or	$t2,%lo(@K[0]),$t2
+	or	$tx,@X[0],@X[0]
+	 xor	$C,$D,$t1
+	 add	@X[0],$t2,$t2		! X[0]+K[0]
+___
+	for ($i=0;$i<15;$i++)	{ &R0($i,@V);	unshift(@V,pop(@V)); }
+	for (;$i<16;$i++)	{ &R0_1($i,@V);	unshift(@V,pop(@V)); }
+	for (;$i<32;$i++)	{ &R1($i,@V);	unshift(@V,pop(@V)); }
+	for (;$i<48;$i++)	{ &R2($i,@V);	unshift(@V,pop(@V)); }
+	for (;$i<64;$i++)	{ &R3($i,@V);	unshift(@V,pop(@V)); }
+$code.=<<___;
+	srlx	$AB,32,$t1		! unpack A,B,C,D and accumulate
+	add	$inp,64,$inp		! advance inp
+	srlx	$CD,32,$t2
+	add	$t1,$A,$A
+	subcc	$len,1,$len		! done yet?
+	add	$AB,$B,$B
+	add	$t2,$C,$C
+	add	$CD,$D,$D
+	srl	$B,0,$B			! clruw	$B
+	bne	`$bits==64?"%xcc":"%icc"`,.Loop
+	srl	$D,0,$D			! clruw	$D
+
+	st	$A,[$ctx+0]		! write out ctx
+	st	$B,[$ctx+4]
+	st	$C,[$ctx+8]
+	st	$D,[$ctx+12]
+
+	wr	%g0,$saved_asi,%asi
+	ret
+	restore
+.type	md5_block_asm_data_order,#function
+.size	md5_block_asm_data_order,(.-md5_block_asm_data_order)
+
+.asciz	"MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align	4
+___
+
+# Purpose of these subroutines is to explicitly encode VIS instructions,
+# so that one can compile the module without having to specify VIS
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# Idea is to reserve for option to produce "universal" binary and let
+# programmer detect if current CPU is VIS capable at run-time.
+sub unvis {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my $ref,$opf;
+my %visopf = (	"faligndata"	=> 0x048,
+		"for"		=> 0x07c	);
+
+    $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+    if ($opf=$visopf{$mnemonic}) {
+	foreach ($rs1,$rs2,$rd) {
+	    return $ref if (!/%f([0-9]{1,2})/);
+	    $_=$1;
+	    if ($1>=32) {
+		return $ref if ($1&1);
+		# re-encode for upper double register addressing
+		$_=($1|$1>>5)&31;
+	    }
+	}
+
+	return	sprintf ".word\t0x%08x !%s",
+			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+			$ref;
+    } else {
+	return $ref;
+    }
+}
+sub unalignaddr {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my $ref="$mnemonic\t$rs1,$rs2,$rd";
+
+    foreach ($rs1,$rs2,$rd) {
+	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
+	else			{ return $ref; }
+    }
+    return  sprintf ".word\t0x%08x !%s",
+		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
+		    $ref;
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+		&unvis($1,$2,$3,$4)
+	 /ge;
+	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+		&unalignaddr($1,$2,$3,$4)
+	 /ge;
+
+	print $_,"\n";
+}
+
+close STDOUT;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-1.0.1/inline-t4/sparc_arch.h	Mon Apr 15 09:23:35 2013 -0700
@@ -0,0 +1,89 @@
+#ifndef __SPARC_ARCH_H__
+#define __SPARC_ARCH_H__
+
+#define SPARCV9_TICK_PRIVILEGED	(1<<0)
+#define SPARCV9_PREFER_FPU	(1<<1)
+#define SPARCV9_VIS1		(1<<2)
+#define SPARCV9_VIS2		(1<<3)	/* reserved */
+#define SPARCV9_FMADD		(1<<4)	/* reserved for SPARC64 V */
+#define SPARCV9_BLK		(1<<5)	/* VIS1 block copy */
+#define SPARCV9_VIS3		(1<<6)
+#define SPARCV9_RANDOM		(1<<7)
+
+/*
+ * OPENSSL_sparcv9cap_P[1] is copy of Compatibility Feature Register,
+ * %asr26, SPARC-T4 and later. There is no SPARCV9_CFR bit in
+ * OPENSSL_sparcv9cap_P[0], as %cfr copy is sufficient...
+ */
+#define CFR_AES		0x00000001 /* Supports AES opcodes     */
+#define CFR_DES		0x00000002 /* Supports DES opcodes     */
+#define CFR_KASUMI	0x00000004 /* Supports KASUMI opcodes  */
+#define CFR_CAMELLIA	0x00000008 /* Supports CAMELLIA opcodes*/
+#define CFR_MD5		0x00000010 /* Supports MD5 opcodes     */
+#define CFR_SHA1	0x00000020 /* Supports SHA1 opcodes    */
+#define CFR_SHA256	0x00000040 /* Supports SHA256 opcodes  */
+#define CFR_SHA512	0x00000080 /* Supports SHA512 opcodes  */
+#define CFR_MPMUL	0x00000100 /* Supports MPMUL opcodes   */
+#define CFR_MONTMUL	0x00000200 /* Supports MONTMUL opcodes */
+#define CFR_MONTSQR	0x00000400 /* Supports MONTSQR opcodes */
+#define CFR_CRC32C	0x00000800 /* Supports CRC32C opcodes  */
+
+#if defined(OPENSSL_PIC) && !defined(__PIC__)
+# define __PIC__
+#endif
+
+#define SPARC_PIC_THUNK(reg)	\
+	.align	32;		\
+.Lpic_thunk:			\
+	jmp	%o7 + 8;	\
+	 add	%o7, reg, reg;
+
+#define SPARC_PIC_THUNK_CALL(reg)			\
+	sethi	%hi(_GLOBAL_OFFSET_TABLE_-4), reg;	\
+	call	.Lpic_thunk;				\
+	 or	reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg;
+
+#if 1
+# define SPARC_SETUP_GOT_REG(reg)	SPARC_PIC_THUNK_CALL(reg)
+#else
+# define SPARC_SETUP_GOT_REG(reg)	\
+	sethi	%hi(_GLOBAL_OFFSET_TABLE_-4), reg;	\
+	call	.+8;					\
+	or	reg,%lo(_GLOBAL_OFFSET_TABLE_+4), reg;	\
+	add	%o7, reg, reg
+#endif
+
+#if	(defined(__GNUC__) && defined(__arch64__)) || \
+	(defined(__SUNPRO_C) && defined(__sparcv9))
+
+# define SPARC_LOAD_ADDRESS(SYM, reg)	\
+	setx	SYM, %o7, reg;
+# define LDPTR	ldx
+
+#else
+
+# define SPARC_LOAD_ADDRESS(SYM, reg)	\
+	set	SYM, reg;
+# define LDPTR	ld
+# define SPARC_LOAD_ADDRESS_LEAF(SYM,reg,tmp) SPARC_LOAD_ADDRESS(SYM,reg)
+
+#endif
+
+#ifdef __PIC__
+# undef	SPARC_LOAD_ADDRESS
+# undef SPARC_LOAD_ADDRESS_LEAF
+# define SPARC_LOAD_ADDRESS(SYM, reg)	\
+	SPARC_SETUP_GOT_REG(reg);	\
+	sethi	%hi(SYM), %o7;		\
+	or	%o7, %lo(SYM), %o7;	\
+	LDPTR	[reg + %o7], reg;
+#endif
+
+#ifndef SPARC_LOAD_ADDRESS_LEAF
+# define SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp)	\
+	mov	%o7, tmp;			\
+	SPARC_LOAD_ADDRESS(SYM, reg)		\
+	mov	tmp, %o7;
+#endif
+
+#endif	/* __SPARC_ARCH_H__ */
--- a/components/openssl/openssl-1.0.1/patches/openssl-1.0.1e-t4-engine.sparc-patch	Mon Apr 15 09:10:33 2013 -0700
+++ b/components/openssl/openssl-1.0.1/patches/openssl-1.0.1e-t4-engine.sparc-patch	Mon Apr 15 09:23:35 2013 -0700
@@ -11,10 +11,10 @@
 
  my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:";
  my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
--my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
+-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
 -my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
-+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o t4_des.o:aes_core.o aes_cbc.o aes-sparcv9.o t4_aes.o::t4_md5.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o t4_sha1.o t4_sha2.o:::::::ghash-sparcv9.o::void";
-+my $sparcv9_fips_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o:des_enc-sparc.o fcrypt_b.o t4_des.o:aes_core.o aes_cbc.o aes-sparcv9.o t4_aes.o::t4_md5.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o t4_sha1.o t4_sha2.o:::::::ghash-sparcv9.o::void";
++my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o t4_des.o:aes_core.o aes_cbc.o aes-sparcv9.o t4_aes.o::md5-sparcv9.o t4_md5.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o t4_sha1.o t4_sha2.o:::::::ghash-sparcv9.o::void";
++my $sparcv9_fips_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o:des_enc-sparc.o fcrypt_b.o t4_des.o:aes_core.o aes_cbc.o aes-sparcv9.o t4_aes.o::md5-sparcv9.o t4_md5.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o t4_sha1.o t4_sha2.o:::::::ghash-sparcv9.o::void";
 +my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o t4_des.o:t4_aes.o::t4_md5.o:t4_sha1.o t4_sha2.o:::::::void";
  my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
  my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
@@ -154,9 +154,9 @@
  GENERAL=Makefile
  TEST=md5test.c
  APPS=
-@@ -52,6 +58,10 @@
- 	$(CC) $(CFLAGS) -E asm/md5-ia64.S | \
-	$(PERL) -ne 's/;\s+/;\n/g; print;' > $@
+@@ -55,6 +59,10 @@
+ md5-sparcv9.S:	asm/md5-sparcv9.pl
+ 	$(PERL) asm/md5-sparcv9.pl $@ $(CFLAGS)
  
 +t4_md5.o: asm/t4_md5.S
 +	as $(ASFLAGSYF) -o $@ asm/t4_md5.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-1.0.1/patches/openssl-t4-inline.sparc-patch	Mon Apr 15 09:23:35 2013 -0700
@@ -0,0 +1,854 @@
+#
+# This file adds inline T4 instruction support to OpenSSL upstream code.
+#
+Index: Configure
+===================================================================
+diff -ru openssl-1.0.1e/Configure openssl-1.0.1e/Configure
+--- openssl-1.0.1e/Configure 2011-05-24 17:02:24.000000000 -0700
++++ openssl-1.0.1e/Configure 2011-07-27 10:48:17.817470000 -0700
+@@ -135,7 +135,7 @@
+
+ my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:";
+ my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
+-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
++my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
+ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
+ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
+ my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
+Index: crypto/sparccpuid.S
+===================================================================
+diff -ru openssl-1.0.1e/crypto/sparccpuid.S openssl-1.0.1e/crypto/sparccpuid.S
+--- openssl-1.0.1e/crypto/sparccpuid.S 2011-05-24 17:02:24.000000000 -0700
++++ openssl-1.0.1e/crypto/sparccpuid.S 2011-07-27 10:48:17.817470000 -0700
+@@ -251,6 +251,11 @@
+ !	UltraSPARC IIe		7
+ !	UltraSPARC III		7
+ !	UltraSPARC T1		24
++!	SPARC T4		65(*)
++!
++! (*)	result has lesser to do with VIS instruction latencies, rdtick
++!	appears that slow, but it does the trick in sense that FP and
++!	VIS code paths are still slower than integer-only ones.
+ !
+ ! Numbers for T2 and SPARC64 V-VII are more than welcomed.
+ !
+@@ -260,6 +265,8 @@
+ .global	_sparcv9_vis1_instrument
+ .align	8
+ _sparcv9_vis1_instrument:
++	.word	0x81b00d80	!fxor	%f0,%f0,%f0
++	.word	0x85b08d82	!fxor	%f2,%f2,%f2
+ 	.word	0x91410000	!rd	%tick,%o0
+ 	.word	0x81b00d80	!fxor	%f0,%f0,%f0
+ 	.word	0x85b08d82	!fxor	%f2,%f2,%f2
+@@ -314,6 +321,30 @@
+ .type	_sparcv9_fmadd_probe,#function
+ .size	_sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe
+ 
++.global	_sparcv9_rdcfr
++.align	8
++_sparcv9_rdcfr:
++	retl
++	.word	0x91468000	!rd	%asr26,%o0
++.type	_sparcv9_rdcfr,#function
++.size	_sparcv9_rdcfr,.-_sparcv9_rdcfr
++
++.global	_sparcv9_vis3_probe
++.align	8
++_sparcv9_vis3_probe:
++	retl
++	.word	0x81b022a0	!xmulx	%g0,%g0,%g0
++.type	_sparcv9_vis3_probe,#function
++.size	_sparcv9_vis3_probe,.-_sparcv9_vis3_probe
++
++.global	_sparcv9_random
++.align	8
++_sparcv9_random:
++	retl
++	.word	0x91b002a0	!random	%o0
++.type	_sparcv9_random,#function
++.size	_sparcv9_random,.-_sparcv9_random	! size is measured from the label of this routine
++
+ .global	OPENSSL_cleanse
+ .align	32
+ OPENSSL_cleanse:
+
+Index: crypto/sparcv9cap.c
+===================================================================
+diff -ru openssl-1.0.1e/crypto/sparcv9cap.c openssl-1.0.1e/crypto/sparcv9cap.c
+--- openssl-1.0.1e/crypto/sparcv9cap.c 2011-05-24 17:02:24.000000000 -0700
++++ openssl-1.0.1e/crypto/sparcv9cap.c 2011-07-27 10:48:17.817470000 -0700
+@@ -6,17 +6,12 @@
+ #include <sys/time.h>
+ #include <openssl/bn.h>
+ 
+-#define SPARCV9_TICK_PRIVILEGED	(1<<0)
+-#define SPARCV9_PREFER_FPU	(1<<1)
+-#define SPARCV9_VIS1		(1<<2)
+-#define SPARCV9_VIS2		(1<<3)	/* reserved */
+-#define SPARCV9_FMADD		(1<<4)	/* reserved for SPARC64 V */
++#include "sparc_arch.h"
+ 
+-#ifndef	_BOOT
+-static int OPENSSL_sparcv9cap_P=SPARCV9_TICK_PRIVILEGED;
+-#else
+-static int OPENSSL_sparcv9cap_P = SPARCV9_VIS1;
++#if defined(__GNUC__) && defined(__linux)
++__attribute__((visibility("hidden")))
+ #endif
++unsigned int OPENSSL_sparcv9cap_P[2]={SPARCV9_TICK_PRIVILEGED,0};
+ 
+ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
+ 	{
+@@ -24,7 +19,7 @@
+ 	int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
+ 
+ 	if (num>=8 && !(num&1) &&
+-	    (OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
++	    (OPENSSL_sparcv9cap_P[0]&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
+ 		(SPARCV9_PREFER_FPU|SPARCV9_VIS1))
+ 		return bn_mul_mont_fpu(rp,ap,bp,np,n0,num);
+ 	else
+@@ -36,11 +31,15 @@
+ unsigned long	_sparcv9_vis1_instrument(void);
+ void		_sparcv9_vis2_probe(void);
+ void		_sparcv9_fmadd_probe(void);
++unsigned long	_sparcv9_rdcfr(void);
++void		_sparcv9_vis3_probe(void);
++unsigned long	_sparcv9_random(void);
++size_t 		_sparcv9_vis1_instrument_bus(unsigned int *,size_t);
++size_t		_sparcv9_vis1_instrument_bus2(unsigned int *,size_t,size_t);	/* name must match the call in OPENSSL_instrument_bus2 */
+ 
+-#ifndef _BOOT
+ unsigned long OPENSSL_rdtsc(void)
+ 	{
+-	if (OPENSSL_sparcv9cap_P&SPARCV9_TICK_PRIVILEGED)
++	if (OPENSSL_sparcv9cap_P[0]&SPARCV9_TICK_PRIVILEGED)
+ #if defined(__sun) && defined(__SVR4)
+ 		return gethrtime();
+ #else
+@@ -49,19 +48,26 @@
+ 	else
+ 		return _sparcv9_rdtick();
+ 	}
+-#endif
+ 
+-#if defined(_BOOT)
+-/*
+- * Hardcoding sparc capabilities for wanboot.
+- * Older CPUs are EOLed anyway.
+- */
+-void OPENSSL_cpuid_setup(void)
++size_t OPENSSL_instrument_bus(unsigned int *out,size_t cnt)
+ 	{
+-	OPENSSL_sparcv9cap_P = SPARCV9_VIS1;
++	if ((OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK)) ==
++			SPARCV9_BLK)	/* BLK set, TICK_PRIVILEGED clear; parens needed as == binds tighter than & */
++		return _sparcv9_vis1_instrument_bus(out,cnt);
++	else
++		return 0;
+ 	}
+ 
+-#elif 0 && defined(__sun) && defined(__SVR4)
++size_t OPENSSL_instrument_bus2(unsigned int *out,size_t cnt,size_t max)
++	{
++	if ((OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK)) ==
++			SPARCV9_BLK)	/* BLK set, TICK_PRIVILEGED clear; parens needed as == binds tighter than & */
++		return _sparcv9_vis1_instrument_bus2(out,cnt,max);
++	else
++		return 0;
++	}
++
++#if 0 && defined(__sun) && defined(__SVR4)
+ /* This code path is disabled, because of incompatibility of
+  * libdevinfo.so.1 and libmalloc.so.1 (see below for details)
+  */
+@@ -85,11 +91,11 @@
+ 	if (!strcmp (name,"SUNW,UltraSPARC") ||
+ 	    !strncmp(name,"SUNW,UltraSPARC-I",17))  /* covers II,III,IV */
+ 		{
+-		OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU|SPARCV9_VIS1;
++		OPENSSL_sparcv9cap_P[0] |= SPARCV9_PREFER_FPU|SPARCV9_VIS1;
+ 
+ 		/* %tick is privileged only on UltraSPARC-I/II, but not IIe */
+ 		if (name[14]!='\0' && name[17]!='\0' && name[18]!='\0')
+-			OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
++			OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
+ 
+ 		return DI_WALK_TERMINATE;
+ 		}
+@@ -96,7 +102,7 @@
+ 	/* This is expected to catch remaining UltraSPARCs, such as T1 */
+ 	else if (!strncmp(name,"SUNW,UltraSPARC",15))
+ 		{
+-		OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
++		OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
+ 
+ 		return DI_WALK_TERMINATE;
+ 		}
+@@ -115,7 +121,7 @@
+ 
+ 	if ((e=getenv("OPENSSL_sparcv9cap")))
+ 		{
+-		OPENSSL_sparcv9cap_P=strtoul(e,NULL,0);
++		OPENSSL_sparcv9cap_P[0]=strtoul(e,NULL,0);
+ 		return;
+ 		}
+ 
+@@ -123,17 +129,17 @@
+ 		{
+ 		if (strcmp(si,"sun4v"))
+ 			/* FPU is preferred for all CPUs, but US-T1/2 */
+-			OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU;
++			OPENSSL_sparcv9cap_P[0] |= SPARCV9_PREFER_FPU;
+ 		}
+ 
+ 	if (sysinfo(SI_ISALIST,si,sizeof(si))>0)
+ 		{
+ 		if (strstr(si,"+vis"))
+-			OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
++			OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS1|SPARCV9_BLK;
+ 		if (strstr(si,"+vis2"))
+ 			{
+-			OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
+-			OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
++			OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
++			OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
+ 			return;
+ 			}
+ 		}
+@@ -193,12 +199,14 @@
+  
+ 	if ((e=getenv("OPENSSL_sparcv9cap")))
+ 		{
+-		OPENSSL_sparcv9cap_P=strtoul(e,NULL,0);
++		OPENSSL_sparcv9cap_P[0]=strtoul(e,NULL,0);
++		if ((e=strchr(e,':')))
++			OPENSSL_sparcv9cap_P[1]=strtoul(e+1,NULL,0);
+ 		return;
+ 		}
+ 
+ 	/* Initial value, fits UltraSPARC-I&II... */
+-	OPENSSL_sparcv9cap_P = SPARCV9_PREFER_FPU|SPARCV9_TICK_PRIVILEGED;
++	OPENSSL_sparcv9cap_P[0] = SPARCV9_PREFER_FPU|SPARCV9_TICK_PRIVILEGED;
+ 
+ 	sigfillset(&all_masked);
+ 	sigdelset(&all_masked,SIGILL);
+@@ -221,20 +229,20 @@
+ 	if (sigsetjmp(common_jmp,1) == 0)
+ 		{
+ 		_sparcv9_rdtick();
+-		OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
++		OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
+ 		}
+ 
+ 	if (sigsetjmp(common_jmp,1) == 0)
+ 		{
+ 		_sparcv9_vis1_probe();
+-		OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
++		OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS1|SPARCV9_BLK;
+ 		/* detect UltraSPARC-Tx, see sparccpud.S for details... */
+ 		if (_sparcv9_vis1_instrument() >= 12)
+-			OPENSSL_sparcv9cap_P &= ~(SPARCV9_VIS1|SPARCV9_PREFER_FPU);
++			OPENSSL_sparcv9cap_P[0] &= ~(SPARCV9_VIS1|SPARCV9_PREFER_FPU);
+ 		else
+ 			{
+ 			_sparcv9_vis2_probe();
+-			OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
++			OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
+ 			}
+ 		}
+ 
+@@ -241,9 +249,37 @@
+ 	if (sigsetjmp(common_jmp,1) == 0)
+ 		{
+ 		_sparcv9_fmadd_probe();
+-		OPENSSL_sparcv9cap_P |= SPARCV9_FMADD;
++		OPENSSL_sparcv9cap_P[0] |= SPARCV9_FMADD;
+ 		}
+ 
++	/*
++	 * VIS3 flag is tested independently from VIS1, unlike VIS2 that is,
++	 * because VIS3 defines even integer instructions.
++	 */
++	if (sigsetjmp(common_jmp,1) == 0)
++		{
++		_sparcv9_vis3_probe();
++		OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS3;
++		}
++
++	if (sigsetjmp(common_jmp,1) == 0)
++		{
++		(void)_sparcv9_random();
++		OPENSSL_sparcv9cap_P[0] |= SPARCV9_RANDOM;
++		}
++
++	/*
++	 * In wait for better solution _sparcv9_rdcfr is masked by
++	 * VIS3 flag, because it goes to uninterruptable endless
++	 * loop on UltraSPARC II running Solaris. Things might be
++	 * different on Linux...
++	 */
++	if ((OPENSSL_sparcv9cap_P[0]&SPARCV9_VIS3) &&
++	    sigsetjmp(common_jmp,1) == 0)
++		{
++		OPENSSL_sparcv9cap_P[1] = (unsigned int)_sparcv9_rdcfr();
++		}
++
+ 	sigaction(SIGBUS,&bus_oact,NULL);
+ 	sigaction(SIGILL,&ill_oact,NULL);
+ 
+Index: crypto/md5/Makefile
+===================================================================
+diff -ru openssl-1.0.1e/crypto/md5/Makefile openssl-1.0.1e/crypto/md5/Makefile
+--- openssl-1.0.1e/crypto/md5/Makefile    2011-05-24 17:02:24.000000000 -0700
++++ openssl-1.0.1e/crypto/md5/Makefile    2011-07-27 10:48:17.817470000 -0700
+@@ -52,6 +52,9 @@
+ 	$(CC) $(CFLAGS) -E asm/md5-ia64.S | \
+ 	$(PERL) -ne 's/;\s+/;\n/g; print;' > $@
+ 
++md5-sparcv9.S:	asm/md5-sparcv9.pl
++	$(PERL) asm/md5-sparcv9.pl $@ $(CFLAGS)
++
+ files:
+ 	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
+ 
+Index: crypto/md5/md5_locl.h
+===================================================================
+diff -ru openssl-1.0.1e/crypto/md5/md5_locl.h openssl-1.0.1e/crypto/md5/md5_locl.h
+--- openssl-1.0.1e/crypto/md5/md5_locl.h    2011-05-24 17:02:24.000000000 -0700
++++ openssl-1.0.1e/crypto/md5/md5_locl.h    2011-07-27 10:48:17.817470000 -0700
+@@ -71,6 +71,8 @@
+ #  define md5_block_data_order md5_block_asm_data_order
+ # elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
+ #  define md5_block_data_order md5_block_asm_data_order
++# elif defined(__sparc) || defined(__sparc__)
++#  define md5_block_data_order md5_block_asm_data_order
+ # endif
+ #endif
+
+Index: crypto/sha/Makefile
+===================================================================
+diff -ru openssl-1.0.1e/crypto/sha/Makefile openssl-1.0.1e/crypto/sha/Makefile
+--- openssl-1.0.1e/crypto/sha/Makefile    2011-05-24 17:02:24.000000000 -0700
++++ openssl-1.0.1e/crypto/sha/Makefile    2011-07-27 10:48:17.817470000 -0700
+@@ -66,9 +66,9 @@
+ sha1-x86_64.s:	asm/sha1-x86_64.pl;	$(PERL) asm/sha1-x86_64.pl $(PERLASM_SCHEME) > $@
+ sha256-x86_64.s:asm/sha512-x86_64.pl;	$(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@
+ sha512-x86_64.s:asm/sha512-x86_64.pl;	$(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@
+-sha1-sparcv9.s:	asm/sha1-sparcv9.pl;	$(PERL) asm/sha1-sparcv9.pl $@ $(CFLAGS)
+-sha256-sparcv9.s:asm/sha512-sparcv9.pl;	$(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
+-sha512-sparcv9.s:asm/sha512-sparcv9.pl;	$(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
++sha1-sparcv9.S:	asm/sha1-sparcv9.pl;	$(PERL) asm/sha1-sparcv9.pl $@ $(CFLAGS)
++sha256-sparcv9.S:asm/sha512-sparcv9.pl;	$(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
++sha512-sparcv9.S:asm/sha512-sparcv9.pl;	$(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
+ 
+ sha1-ppc.s:	asm/sha1-ppc.pl;	$(PERL) asm/sha1-ppc.pl $(PERLASM_SCHEME) $@
+ sha256-ppc.s:	asm/sha512-ppc.pl;	$(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
+Index: crypto/sha/asm/sha1-sparcv9.pl
+===================================================================
+diff -ru openssl-1.0.1e/crypto/sha/asm/sha1-sparcv9.pl openssl-1.0.1e/crypto/sha/asm/sha1-sparcv9.pl
+--- openssl-1.0.1e/crypto/sha/asm/sha1-sparcv9.pl 2011-05-24 17:02:24.000000000 -0700
++++ openssl-1.0.1e/crypto/sha/asm/sha1-sparcv9.pl 2011-07-27 10:48:17.817470000 -0700
+@@ -5,6 +5,8 @@
+ # project. The module is, however, dual licensed under OpenSSL and
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
+ # details see http://www.openssl.org/~appro/cryptogams/.
++#
++# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
+ # ====================================================================
+ 
+ # Performance improvement is not really impressive on pre-T1 CPU: +8%
+@@ -18,6 +20,11 @@
+ # ensure scalability on UltraSPARC T1, or rather to avoid decay when
+ # amount of active threads exceeds the number of physical cores.
+ 
++# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
++# faster than software. Multi-process benchmark saturates at 11x
++# single-process result on 8-core processor, or ~9GBps per 2.85GHz
++# socket.
++
+ $bits=32;
+ for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+ if ($bits==64)	{ $bias=2047; $frame=192; }
+@@ -183,11 +190,93 @@
+ .register	%g3,#scratch
+ ___
+ $code.=<<___;
++#include "sparc_arch.h"
++
+ .section	".text",#alloc,#execinstr
+ 
++#ifdef __PIC__
++SPARC_PIC_THUNK(%g1)
++#endif
++
+ .align	32
+ .globl	sha1_block_data_order
+ sha1_block_data_order:
++	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
++	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]
++
++	andcc	%g1, CFR_SHA1, %g0
++	be	.Lsoftware
++	nop
++
++	ld	[%o0 + 0x00], %f0	! load context
++	ld	[%o0 + 0x04], %f1
++	ld	[%o0 + 0x08], %f2
++	andcc	%o1, 0x7, %g0
++	ld	[%o0 + 0x0c], %f3
++	bne,pn	%icc, .Lhwunaligned
++	 ld	[%o0 + 0x10], %f4
++
++.Lhw_loop:
++	ldd	[%o1 + 0x00], %f8
++	ldd	[%o1 + 0x08], %f10
++	ldd	[%o1 + 0x10], %f12
++	ldd	[%o1 + 0x18], %f14
++	ldd	[%o1 + 0x20], %f16
++	ldd	[%o1 + 0x28], %f18
++	ldd	[%o1 + 0x30], %f20
++	subcc	%o2, 1, %o2		! done yet? 
++	ldd	[%o1 + 0x38], %f22
++	add	%o1, 0x40, %o1
++
++	.word	0x81b02820		! SHA1
++
++	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhw_loop
++	nop
++
++.Lhwfinish:
++	st	%f0, [%o0 + 0x00]	! store context
++	st	%f1, [%o0 + 0x04]
++	st	%f2, [%o0 + 0x08]
++	st	%f3, [%o0 + 0x0c]
++	retl
++	st	%f4, [%o0 + 0x10]
++
++.align	8
++.Lhwunaligned:
++	alignaddr %o1, %g0, %o1
++
++	ldd	[%o1 + 0x00], %f10
++.Lhwunaligned_loop:
++	ldd	[%o1 + 0x08], %f12
++	ldd	[%o1 + 0x10], %f14
++	ldd	[%o1 + 0x18], %f16
++	ldd	[%o1 + 0x20], %f18
++	ldd	[%o1 + 0x28], %f20
++	ldd	[%o1 + 0x30], %f22
++	ldd	[%o1 + 0x38], %f24
++	subcc	%o2, 1, %o2		! done yet?
++	ldd	[%o1 + 0x40], %f26
++	add	%o1, 0x40, %o1
++
++	faligndata %f10, %f12, %f8
++	faligndata %f12, %f14, %f10
++	faligndata %f14, %f16, %f12
++	faligndata %f16, %f18, %f14
++	faligndata %f18, %f20, %f16
++	faligndata %f20, %f22, %f18
++	faligndata %f22, %f24, %f20
++	faligndata %f24, %f26, %f22
++
++	.word	0x81b02820		! SHA1
++
++	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
++	for	%f26, %f26, %f10	! %f10=%f26
++
++	ba	.Lhwfinish
++	nop
++
++.align	16
++.Lsoftware:
+ 	save	%sp,-$frame,%sp
+ 	sllx	$len,6,$len
+ 	add	$inp,$len,$len
+@@ -279,6 +368,62 @@
+ .align	4
+ ___
+ 
+-$code =~ s/\`([^\`]*)\`/eval $1/gem;
+-print $code;
++# Purpose of these subroutines is to explicitly encode VIS instructions,
++# so that one can compile the module without having to specify VIS
++# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
++# Idea is to reserve for option to produce "universal" binary and let
++# programmer detect if current CPU is VIS capable at run-time.
++sub unvis {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my ($ref,$opf);	# parenthesized so that both names are lexical
++my %visopf = (	"faligndata"	=> 0x048,
++		"for"		=> 0x07c	);
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rd";	# textual form, returned unchanged when not encodable
++
++    if ($opf=$visopf{$mnemonic}) {
++	foreach ($rs1,$rs2,$rd) {
++	    return $ref if (!/%f([0-9]{1,2})/);
++	    $_=$1;
++	    if ($1>=32) {
++		return $ref if ($1&1);
++		# re-encode for upper double register addressing
++		$_=($1|$1>>5)&31;
++	    }
++	}
++
++	return	sprintf ".word\t0x%08x !%s",
++			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
++			$ref;
++    } else {
++	return $ref;
++    }
++}
++sub unalignaddr {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
++my $ref="$mnemonic\t$rs1,$rs2,$rd";
++
++    foreach ($rs1,$rs2,$rd) {
++	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
++	else			{ return $ref; }
++    }
++    return  sprintf ".word\t0x%08x !%s",
++		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
++		    $ref;
++}
++
++foreach (split("\n",$code)) {
++	s/\`([^\`]*)\`/eval $1/ge;
++
++	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++		&unvis($1,$2,$3,$4)
++	 /ge;
++	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
++		&unalignaddr($1,$2,$3,$4)
++	 /ge;
++
++	print $_,"\n";
++}
++
+ close STDOUT;
+
+Index: crypto/sha/asm/sha512-sparcv9.pl
+===================================================================
+diff -ru openssl-1.0.1e/crypto/sha/asm/sha512-sparcv9.pl openssl-1.0.1e/crypto/sha/asm/sha512-sparcv9.pl
+--- openssl-1.0.1e/crypto/sha/asm/sha512-sparcv9.pl 2011-05-24 17:02:24.000000000 -0700
++++ openssl-1.0.1e/crypto/sha/asm/sha512-sparcv9.pl 2011-07-27 10:48:17.817470000 -0700
+@@ -5,6 +5,8 @@
+ # project. The module is, however, dual licensed under OpenSSL and
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
+ # details see http://www.openssl.org/~appro/cryptogams/.
++#
++# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
+ # ====================================================================
+ 
+ # SHA256 performance improvement over compiler generated code varies
+@@ -41,6 +43,12 @@
+ #	loads are always slower than one 64-bit load. Once again this
+ #	is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
+ #	2x32-bit loads can be as fast as 1x64-bit ones.
++#
++# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
++# which is 9.3x/11.1x faster than software. Multi-process benchmark
++# saturates at 11.5x single-process result on 8-core processor, or
++# ~11/16GBps per 2.85GHz socket.
++
+ 
+ $bits=32;
+ for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+@@ -386,6 +394,8 @@
+ .register	%g3,#scratch
+ ___
+ $code.=<<___;
++#include "sparc_arch.h"
++
+ .section	".text",#alloc,#execinstr
+ 
+ .align	64
+@@ -457,8 +467,196 @@
+ }
+ $code.=<<___;
+ .size	K${label},.-K${label}
++
++#ifdef __PIC__
++SPARC_PIC_THUNK(%g1)
++#endif
++
+ .globl	sha${label}_block_data_order
++.align	32
+ sha${label}_block_data_order:
++	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
++	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]
++
++	andcc	%g1, CFR_SHA${label}, %g0
++	be	.Lsoftware
++	nop
++___
++$code.=<<___ if ($SZ==8); 		# SHA512
++	ldd	[%o0 + 0x00], %f0	! load context
++	ldd	[%o0 + 0x08], %f2
++	ldd	[%o0 + 0x10], %f4
++	ldd	[%o0 + 0x18], %f6
++	ldd	[%o0 + 0x20], %f8
++	ldd	[%o0 + 0x28], %f10
++	andcc	%o1, 0x7, %g0
++	ldd	[%o0 + 0x30], %f12
++	bne,pn	%icc, .Lhwunaligned
++	 ldd	[%o0 + 0x38], %f14
++
++.Lhwaligned_loop:
++	ldd	[%o1 + 0x00], %f16
++	ldd	[%o1 + 0x08], %f18
++	ldd	[%o1 + 0x10], %f20
++	ldd	[%o1 + 0x18], %f22
++	ldd	[%o1 + 0x20], %f24
++	ldd	[%o1 + 0x28], %f26
++	ldd	[%o1 + 0x30], %f28
++	ldd	[%o1 + 0x38], %f30
++	ldd	[%o1 + 0x40], %f32
++	ldd	[%o1 + 0x48], %f34
++	ldd	[%o1 + 0x50], %f36
++	ldd	[%o1 + 0x58], %f38
++	ldd	[%o1 + 0x60], %f40
++	ldd	[%o1 + 0x68], %f42
++	ldd	[%o1 + 0x70], %f44
++	subcc	%o2, 1, %o2		! done yet?
++	ldd	[%o1 + 0x78], %f46
++	add	%o1, 0x80, %o1
++
++	.word	0x81b02860		! SHA512
++
++	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwaligned_loop
++	nop
++
++.Lhwfinish:
++	std	%f0, [%o0 + 0x00]	! store context
++	std	%f2, [%o0 + 0x08]
++	std	%f4, [%o0 + 0x10]
++	std	%f6, [%o0 + 0x18]
++	std	%f8, [%o0 + 0x20]
++	std	%f10, [%o0 + 0x28]
++	std	%f12, [%o0 + 0x30]
++	retl
++	 std	%f14, [%o0 + 0x38]
++
++.align	16
++.Lhwunaligned:
++	alignaddr %o1, %g0, %o1
++
++	ldd	[%o1 + 0x00], %f18
++.Lhwunaligned_loop:
++	ldd	[%o1 + 0x08], %f20
++	ldd	[%o1 + 0x10], %f22
++	ldd	[%o1 + 0x18], %f24
++	ldd	[%o1 + 0x20], %f26
++	ldd	[%o1 + 0x28], %f28
++	ldd	[%o1 + 0x30], %f30
++	ldd	[%o1 + 0x38], %f32
++	ldd	[%o1 + 0x40], %f34
++	ldd	[%o1 + 0x48], %f36
++	ldd	[%o1 + 0x50], %f38
++	ldd	[%o1 + 0x58], %f40
++	ldd	[%o1 + 0x60], %f42
++	ldd	[%o1 + 0x68], %f44
++	ldd	[%o1 + 0x70], %f46
++	ldd	[%o1 + 0x78], %f48
++	subcc	%o2, 1, %o2		! done yet?
++	ldd	[%o1 + 0x80], %f50
++	add	%o1, 0x80, %o1
++
++	faligndata %f18, %f20, %f16
++	faligndata %f20, %f22, %f18
++	faligndata %f22, %f24, %f20
++	faligndata %f24, %f26, %f22
++	faligndata %f26, %f28, %f24
++	faligndata %f28, %f30, %f26
++	faligndata %f30, %f32, %f28
++	faligndata %f32, %f34, %f30
++	faligndata %f34, %f36, %f32
++	faligndata %f36, %f38, %f34
++	faligndata %f38, %f40, %f36
++	faligndata %f40, %f42, %f38
++	faligndata %f42, %f44, %f40
++	faligndata %f44, %f46, %f42
++	faligndata %f46, %f48, %f44
++	faligndata %f48, %f50, %f46
++
++	.word	0x81b02860		! SHA512
++
++	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
++	for	%f50, %f50, %f18	! %f18=%f50
++
++	ba	.Lhwfinish
++	nop
++___
++$code.=<<___ if ($SZ==4); 		# SHA256
++	ld	[%o0 + 0x00], %f0
++	ld	[%o0 + 0x04], %f1
++	ld	[%o0 + 0x08], %f2
++	ld	[%o0 + 0x0c], %f3
++	ld	[%o0 + 0x10], %f4
++	ld	[%o0 + 0x14], %f5
++	andcc	%o1, 0x7, %g0
++	ld	[%o0 + 0x18], %f6
++	bne,pn	%icc, .Lhwunaligned
++	 ld	[%o0 + 0x1c], %f7
++
++.Lhwloop:
++	ldd	[%o1 + 0x00], %f8
++	ldd	[%o1 + 0x08], %f10
++	ldd	[%o1 + 0x10], %f12
++	ldd	[%o1 + 0x18], %f14
++	ldd	[%o1 + 0x20], %f16
++	ldd	[%o1 + 0x28], %f18
++	ldd	[%o1 + 0x30], %f20
++	subcc	%o2, 1, %o2		! done yet?
++	ldd	[%o1 + 0x38], %f22
++	add	%o1, 0x40, %o1
++
++	.word	0x81b02840		! SHA256
++
++	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwloop
++	nop
++
++.Lhwfinish:
++	st	%f0, [%o0 + 0x00]	! store context
++	st	%f1, [%o0 + 0x04]
++	st	%f2, [%o0 + 0x08]
++	st	%f3, [%o0 + 0x0c]
++	st	%f4, [%o0 + 0x10]
++	st	%f5, [%o0 + 0x14]
++	st	%f6, [%o0 + 0x18]
++	retl
++	 st	%f7, [%o0 + 0x1c]
++
++.align	8
++.Lhwunaligned:
++	alignaddr %o1, %g0, %o1
++
++	ldd	[%o1 + 0x00], %f10
++.Lhwunaligned_loop:
++	ldd	[%o1 + 0x08], %f12
++	ldd	[%o1 + 0x10], %f14
++	ldd	[%o1 + 0x18], %f16
++	ldd	[%o1 + 0x20], %f18
++	ldd	[%o1 + 0x28], %f20
++	ldd	[%o1 + 0x30], %f22
++	ldd	[%o1 + 0x38], %f24
++	subcc	%o2, 1, %o2		! done yet?
++	ldd	[%o1 + 0x40], %f26
++	add	%o1, 0x40, %o1
++
++	faligndata %f10, %f12, %f8
++	faligndata %f12, %f14, %f10
++	faligndata %f14, %f16, %f12
++	faligndata %f16, %f18, %f14
++	faligndata %f18, %f20, %f16
++	faligndata %f20, %f22, %f18
++	faligndata %f22, %f24, %f20
++	faligndata %f24, %f26, %f22
++
++	.word	0x81b02840		! SHA256
++
++	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
++	for	%f26, %f26, %f10	! %f10=%f26
++
++	ba	.Lhwfinish
++	nop
++___
++$code.=<<___;
++.align	16
++.Lsoftware:
+ 	save	%sp,`-$frame-$locals`,%sp
+ 	and	$inp,`$align-1`,$tmp31
+ 	sllx	$len,`log(16*$SZ)/log(2)`,$len
+@@ -589,6 +787,62 @@
+ .align	4
+ ___
+ 
+-$code =~ s/\`([^\`]*)\`/eval $1/gem;
+-print $code;
++# Purpose of these subroutines is to explicitly encode VIS instructions,
++# so that one can compile the module without having to specify VIS
++# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
++# Idea is to reserve for option to produce "universal" binary and let
++# programmer detect if current CPU is VIS capable at run-time.
++sub unvis {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my ($ref,$opf);	# parenthesized so that both names are lexical
++my %visopf = (	"faligndata"	=> 0x048,
++		"for"		=> 0x07c	);
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rd";	# textual form, returned unchanged when not encodable
++
++    if ($opf=$visopf{$mnemonic}) {
++	foreach ($rs1,$rs2,$rd) {
++	    return $ref if (!/%f([0-9]{1,2})/);
++	    $_=$1;
++	    if ($1>=32) {
++		return $ref if ($1&1);
++		# re-encode for upper double register addressing
++		$_=($1|$1>>5)&31;
++	    }
++	}
++
++	return	sprintf ".word\t0x%08x !%s",
++			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
++			$ref;
++    } else {
++	return $ref;
++    }
++}
++sub unalignaddr {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
++my $ref="$mnemonic\t$rs1,$rs2,$rd";
++
++    foreach ($rs1,$rs2,$rd) {
++	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
++	else			{ return $ref; }
++    }
++    return  sprintf ".word\t0x%08x !%s",
++		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
++		    $ref;
++}
++
++foreach (split("\n",$code)) {
++	s/\`([^\`]*)\`/eval $1/ge;
++
++	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++		&unvis($1,$2,$3,$4)
++	 /ge;
++	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
++		&unalignaddr($1,$2,$3,$4)
++	 /ge;
++
++	print $_,"\n";
++}
++
+ close STDOUT;
+Index: openssl/apps/speed.c
+===================================================================
+diff -ru openssl-1.0.1e/apps/speed.c openssl-1.0.1e/apps/speed.c
+--- openssl-1.0.1e/apps/speed.c 2011-05-24 17:02:24.000000000 -0700
++++ openssl-1.0.1e/apps/speed.c 2011-07-27 10:48:17.817470000 -0700
+@@ -1551,7 +1551,7 @@
+ 			print_message(names[D_MD5],c[D_MD5][j],lengths[j]);
+ 			Time_F(START);
+ 			for (count=0,run=1; COND(c[D_MD5][j]); count++)
+-				EVP_Digest(&(buf[0]),(unsigned long)lengths[j],&(md5[0]),NULL,EVP_get_digestbyname("md5"),NULL);
++				MD5(buf,lengths[j],md5);
+ 			d=Time_F(STOP);
+ 			print_result(D_MD5,j,count,d);
+ 			}
+@@ -1591,7 +1591,7 @@
+ 			print_message(names[D_SHA1],c[D_SHA1][j],lengths[j]);
+ 			Time_F(START);
+ 			for (count=0,run=1; COND(c[D_SHA1][j]); count++)
+-				EVP_Digest(buf,(unsigned long)lengths[j],&(sha[0]),NULL,EVP_sha1(),NULL);
++				SHA1(buf,lengths[j],sha);
+ 			d=Time_F(STOP);
+ 			print_result(D_SHA1,j,count,d);
+ 			}