components/openssl/openssl-1.0.1/patches/openssl-t4-inline.sparc-patch
changeset 1382 3515c1afdfc8
parent 1373 2fd83bee8884
child 1401 367855861774
--- a/components/openssl/openssl-1.0.1/patches/openssl-t4-inline.sparc-patch	Mon Jul 08 16:18:46 2013 -0700
+++ b/components/openssl/openssl-1.0.1/patches/openssl-t4-inline.sparc-patch	Mon Jul 08 17:50:18 2013 -0700
@@ -11,7 +11,7 @@
  my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:";
  my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
 -my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
-+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
++my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
  my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
  my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
  my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
@@ -20,19 +20,50 @@
 diff -ru openssl-1.0.1e/crypto/sparccpuid.S openssl-1.0.1e/crypto/sparccpuid.S
 --- openssl-1.0.1e/crypto/sparccpuid.S 2011-05-24 17:02:24.000000000 -0700
 +++ openssl-1.0.1e/crypto/sparccpuid.S 2011-07-27 10:48:17.817470000 -0700
-@@ -251,6 +251,11 @@
+@@ -1,3 +1,7 @@
++#ifdef OPENSSL_FIPSCANISTER
++#include <openssl/fipssyms.h>
++#endif
++
+ #if defined(__SUNPRO_C) && defined(__sparcv9)
+ # define ABI64  /* They've said -xarch=v9 at command line */
+ #elif defined(__GNUC__) && defined(__arch64__)
+@@ -123,7 +127,7 @@
+ 			fmovs	%f1,%f3
+ 			fmovs	%f0,%f2
+ 
+-	add	%fp,BIAS,%i0	! return pointer to caller�s top of stack
++	add	%fp,BIAS,%i0	! return pointer to caller?s top of stack
+ 
+ 	ret
+ 	restore
+@@ -235,10 +239,10 @@
+ .global	_sparcv9_vis1_probe
+ .align	8
+ _sparcv9_vis1_probe:
++	.word	0x81b00d80	!fxor	%f0,%f0,%f0
+ 	add	%sp,BIAS+2,%o1
+-	.word	0xc19a5a40	!ldda	[%o1]ASI_FP16_P,%f0
+ 	retl
+-	.word	0x81b00d80	!fxor	%f0,%f0,%f0
++	.word	0xc19a5a40	!ldda	[%o1]ASI_FP16_P,%f0
+ .type	_sparcv9_vis1_probe,#function
+ .size	_sparcv9_vis1_probe,.-_sparcv9_vis1_probe
+ 
+@@ -251,7 +255,12 @@
  !	UltraSPARC IIe		7
  !	UltraSPARC III		7
  !	UltraSPARC T1		24
 +!	SPARC T4		65(*)
-+!
+ !
 +! (*)	result has lesser to do with VIS instruction latencies, rdtick
 +!	appears that slow, but it does the trick in sense that FP and
 +!	VIS code paths are still slower than integer-only ones.
- !
++!
  ! Numbers for T2 and SPARC64 V-VII are more than welcomed.
  !
-@@ -260,6 +265,8 @@
+ ! It would be possible to detect specifically US-T1 by instrumenting
+@@ -260,6 +269,8 @@
  .global	_sparcv9_vis1_instrument
  .align	8
  _sparcv9_vis1_instrument:
@@ -41,7 +72,7 @@
  	.word	0x91410000	!rd	%tick,%o0
  	.word	0x81b00d80	!fxor	%f0,%f0,%f0
  	.word	0x85b08d82	!fxor	%f2,%f2,%f2
-@@ -314,6 +321,30 @@
+@@ -314,6 +325,30 @@
  .type	_sparcv9_fmadd_probe,#function
  .size	_sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe
  
@@ -72,14 +103,125 @@
  .global	OPENSSL_cleanse
  .align	32
  OPENSSL_cleanse:
+@@ -397,11 +432,102 @@
+ .type	OPENSSL_cleanse,#function
+ .size	OPENSSL_cleanse,.-OPENSSL_cleanse
+ 
+-#ifndef _BOOT
++.global	_sparcv9_vis1_instrument_bus
++.align	8
++_sparcv9_vis1_instrument_bus:
++	mov	%o1,%o3					! save cnt
++	.word	0x99410000	!rd	%tick,%o4	! tick
++	mov	%o4,%o5					! lasttick = tick
++	set	0,%g4					! diff
++
++	andn	%o0,63,%g1
++	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
++	.word	0x8143e040	!membar	#Sync
++	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
++	.word	0x8143e040	!membar	#Sync
++	ld	[%o0],%o4
++	add	%o4,%g4,%g4
++	.word	0xc9e2100c	!cas	[%o0],%o4,%g4
++
++.Loop:	.word	0x99410000	!rd	%tick,%o4
++	sub	%o4,%o5,%g4				! diff=tick-lasttick
++	mov	%o4,%o5					! lasttick=tick
++
++	andn	%o0,63,%g1
++	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
++	.word	0x8143e040	!membar	#Sync
++	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
++	.word	0x8143e040	!membar	#Sync
++	ld	[%o0],%o4
++	add	%o4,%g4,%g4
++	.word	0xc9e2100c	!cas	[%o0],%o4,%g4
++	subcc	%o1,1,%o1				! --$cnt
++	bnz	.Loop
++	add	%o0,4,%o0				! ++$out
++
++	retl
++	mov	%o3,%o0
++.type	_sparcv9_vis1_instrument_bus,#function
++.size	_sparcv9_vis1_instrument_bus,.-_sparcv9_vis1_instrument_bus
++
++.global	_sparcv9_vis1_instrument_bus2
++.align	8
++_sparcv9_vis1_instrument_bus2:
++	mov	%o1,%o3					! save cnt
++	sll	%o1,2,%o1				! cnt*=4
++
++	.word	0x99410000	!rd	%tick,%o4	! tick
++	mov	%o4,%o5					! lasttick = tick
++	set	0,%g4					! diff
++
++	andn	%o0,63,%g1
++	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
++	.word	0x8143e040	!membar	#Sync
++	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
++	.word	0x8143e040	!membar	#Sync
++	ld	[%o0],%o4
++	add	%o4,%g4,%g4
++	.word	0xc9e2100c	!cas	[%o0],%o4,%g4
++
++	.word	0x99410000	!rd	%tick,%o4	! tick
++	sub	%o4,%o5,%g4				! diff=tick-lasttick
++	mov	%o4,%o5					! lasttick=tick
++	mov	%g4,%g5					! lastdiff=diff
++.Loop2:
++	andn	%o0,63,%g1
++	.word	0xc1985e00	!ldda	[%g1]0xf0,%f0	! block load
++	.word	0x8143e040	!membar	#Sync
++	.word	0xc1b85c00	!stda	%f0,[%g1]0xe0	! block store and commit
++	.word	0x8143e040	!membar	#Sync
++	ld	[%o0],%o4
++	add	%o4,%g4,%g4
++	.word	0xc9e2100c	!cas	[%o0],%o4,%g4
++
++	subcc	%o2,1,%o2				! --max
++	bz	.Ldone2
++	nop
++
++	.word	0x99410000	!rd	%tick,%o4	! tick
++	sub	%o4,%o5,%g4				! diff=tick-lasttick
++	mov	%o4,%o5					! lasttick=tick
++	cmp	%g4,%g5
++	mov	%g4,%g5					! lastdiff=diff
++
++	.word	0x83408000	!rd	%ccr,%g1
++	and	%g1,4,%g1				! isolate zero flag
++	xor	%g1,4,%g1				! flip zero flag
++
++	subcc	%o1,%g1,%o1				! conditional --$cnt
++	bnz	.Loop2
++	add	%o0,%g1,%o0				! conditional ++$out
++
++.Ldone2:
++	srl	%o1,2,%o1
++	retl
++	sub	%o3,%o1,%o0
++.type	_sparcv9_vis1_instrument_bus2,#function
++.size	_sparcv9_vis1_instrument_bus2,.-_sparcv9_vis1_instrument_bus2
++
+ .section	".init",#alloc,#execinstr
+ 	call	OPENSSL_cpuid_setup
+ 	nop
+-#else
+-	nop
+-	nop
+-#endif
 
 Index: crypto/sparcv9cap.c
 ===================================================================
 diff -ru openssl-1.0.1e/crypto/sparcv9cap.c openssl-1.0.1e/crypto/sparcv9cap.c
 --- openssl-1.0.1e/crypto/sparcv9cap.c 2011-05-24 17:02:24.000000000 -0700
 +++ openssl-1.0.1e/crypto/sparcv9cap.c 2011-07-27 10:48:17.817470000 -0700
-@@ -6,16 +6,15 @@
+@@ -4,31 +4,55 @@
+ #include <setjmp.h>
+ #include <signal.h>
  #include <sys/time.h>
++#include <unistd.h>
  #include <openssl/bn.h>
  
 -#define SPARCV9_TICK_PRIVILEGED	(1<<0)
@@ -101,16 +243,50 @@
  #endif
  
  int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
-@@ -24,7 +23,7 @@
+ 	{
++	int bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
+ 	int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
  	int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
  
- 	if (num>=8 && !(num&1) &&
+-	if (num>=8 && !(num&1) &&
 -	    (OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
-+	    (OPENSSL_sparcv9cap_P[0]&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
- 		(SPARCV9_PREFER_FPU|SPARCV9_VIS1))
- 		return bn_mul_mont_fpu(rp,ap,bp,np,n0,num);
- 	else
-@@ -36,11 +35,16 @@
+-		(SPARCV9_PREFER_FPU|SPARCV9_VIS1))
+-		return bn_mul_mont_fpu(rp,ap,bp,np,n0,num);
+-	else
+-		return bn_mul_mont_int(rp,ap,bp,np,n0,num);
++	if (!(num&1) && num>=6)	/* accelerated paths are only tried for an even word count of at least 6 */
++		{
++		if ((num&15)==0 && num<=64 &&
++		    (OPENSSL_sparcv9cap_P[1]&(CFR_MONTMUL|CFR_MONTSQR))== 
++		    			     (CFR_MONTMUL|CFR_MONTSQR))
++			{
++			typedef int (*bn_mul_mont_f)(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0);
++			int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0);
++			int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0);
++			int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0);
++			int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0);
++			static const bn_mul_mont_f funcs[4] = {
++				bn_mul_mont_t4_8,	bn_mul_mont_t4_16,
++				bn_mul_mont_t4_24,	bn_mul_mont_t4_32 };
++			bn_mul_mont_f worker = funcs[num/16-1];	/* num is 16, 32, 48 or 64 here, so the index is 0..3 */
++
++			if ((*worker)(rp,ap,bp,np,n0)) return 1;	/* non-zero return from the worker is treated as success */
++			/* retry once and fall back */
++			if ((*worker)(rp,ap,bp,np,n0)) return 1;
++			return bn_mul_mont_vis3(rp,ap,bp,np,n0,num);	/* both worker attempts returned 0 */
++			}
++		if ((OPENSSL_sparcv9cap_P[0]&SPARCV9_VIS3))
++			return bn_mul_mont_vis3(rp,ap,bp,np,n0,num);
++		else if (num>=8 &&	/* FPU path additionally requires at least 8 words */
++			(OPENSSL_sparcv9cap_P[0]&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
++			(SPARCV9_PREFER_FPU|SPARCV9_VIS1))
++			return bn_mul_mont_fpu(rp,ap,bp,np,n0,num);
++		}
++	return bn_mul_mont_int(rp,ap,bp,np,n0,num);	/* integer-only fallback for every other case */
+ 	}
+ 
+ unsigned long	_sparcv9_rdtick(void);
+@@ -36,11 +60,16 @@
  unsigned long	_sparcv9_vis1_instrument(void);
  void		_sparcv9_vis2_probe(void);
  void		_sparcv9_fmadd_probe(void);
@@ -128,13 +304,13 @@
  #if defined(__sun) && defined(__SVR4)
  		return gethrtime();
  #else
-@@ -51,6 +55,25 @@
+@@ -51,6 +81,25 @@
  	}
  #endif
  
 +size_t OPENSSL_instrument_bus(unsigned int *out,size_t cnt)
 +	{
-+	if (OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK) ==
++	if ((OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK)) ==
 +			SPARCV9_BLK)
 +		return _sparcv9_vis1_instrument_bus(out,cnt);
 +	else
@@ -143,7 +319,7 @@
 +
 +size_t OPENSSL_instrument_bus2(unsigned int *out,size_t cnt,size_t max)
 +	{
-+	if (OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK) ==
++	if ((OPENSSL_sparcv9cap_P[0]&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK)) ==
 +			SPARCV9_BLK)
 +		return _sparcv9_vis1_instrument_bus2(out,cnt,max);
 +	else
@@ -154,7 +330,7 @@
  #if defined(_BOOT)
  /*
   * Hardcoding sparc capabilities for wanboot.
-@@ -58,7 +81,7 @@
+@@ -58,7 +106,7 @@
   */
  void OPENSSL_cpuid_setup(void)
  	{
@@ -163,7 +339,7 @@
  	}
  
  #elif 0 && defined(__sun) && defined(__SVR4)
-@@ -85,11 +108,11 @@
+@@ -85,11 +116,11 @@
  	if (!strcmp (name,"SUNW,UltraSPARC") ||
  	    !strncmp(name,"SUNW,UltraSPARC-I",17))  /* covers II,III,IV */
  		{
@@ -177,7 +353,7 @@
  
  		return DI_WALK_TERMINATE;
  		}
-@@ -96,7 +119,7 @@
+@@ -96,7 +127,7 @@
  	/* This is expected to catch remaining UltraSPARCs, such as T1 */
  	else if (!strncmp(name,"SUNW,UltraSPARC",15))
  		{
@@ -186,7 +362,7 @@
  
  		return DI_WALK_TERMINATE;
  		}
-@@ -115,7 +138,7 @@
+@@ -115,7 +146,7 @@
  
  	if ((e=getenv("OPENSSL_sparcv9cap")))
  		{
@@ -195,7 +371,7 @@
  		return;
  		}
  
-@@ -123,17 +146,17 @@
+@@ -123,17 +154,17 @@
  		{
  		if (strcmp(si,"sun4v"))
  			/* FPU is preferred for all CPUs, but US-T1/2 */
@@ -217,7 +393,7 @@
  			return;
  			}
  		}
-@@ -193,12 +216,14 @@
+@@ -193,12 +224,14 @@
   
  	if ((e=getenv("OPENSSL_sparcv9cap")))
  		{
@@ -234,7 +410,7 @@
  
  	sigfillset(&all_masked);
  	sigdelset(&all_masked,SIGILL);
-@@ -221,20 +246,20 @@
+@@ -221,20 +254,20 @@
  	if (sigsetjmp(common_jmp,1) == 0)
  		{
  		_sparcv9_rdtick();
@@ -259,7 +435,7 @@
  			}
  		}
  
-@@ -241,9 +266,37 @@
+@@ -241,13 +274,53 @@
  	if (sigsetjmp(common_jmp,1) == 0)
  		{
  		_sparcv9_fmadd_probe();
@@ -298,6 +474,22 @@
  	sigaction(SIGBUS,&bus_oact,NULL);
  	sigaction(SIGILL,&ill_oact,NULL);
  
+ 	sigprocmask(SIG_SETMASK,&oset,NULL);
++
++	if (sizeof(size_t)==8)	/* 8-byte size_t: flag is set unconditionally */
++		OPENSSL_sparcv9cap_P[0] |= SPARCV9_64BIT_STACK;
++#ifdef __linux
++	else
++		{
++		int ret = syscall(340);	/* NOTE(review): 340 is presumably __NR_kern_features on Linux/sparc -- confirm */
++
++		if (ret>=0 && ret&1)	/* call succeeded and bit 0 of the result is set */
++			OPENSSL_sparcv9cap_P[0] |= SPARCV9_64BIT_STACK;
++		}
++#endif
+ 	}
+ 
+ #endif
 Index: crypto/md5/Makefile
 ===================================================================
 diff -ru openssl-1.0.1e/crypto/md5/Makefile openssl-1.0.1e/crypto/md5/Makefile
@@ -1163,6 +1355,292 @@
  #endif
  	return 1;
  	}
+Index: openssl/crypto/bn/Makefile
+===================================================================
+diff -ru openssl-1.0.1e/crypto/bn/Makefile openssl-1.0.1e/crypto/bn/Makefile.new
+--- openssl-1.0.1e/crypto/bn/Makefile 2011-05-24 17:02:24.000000000 -0700
++++ openssl-1.0.1e/crypto/bn/Makefile 2011-07-27 10:48:17.817470000 -0700
+@@ -77,6 +77,12 @@
+ 	$(PERL) asm/sparcv9a-mont.pl $(CFLAGS) > $@
+ sparcv9-mont.s:		asm/sparcv9-mont.pl
+ 	$(PERL) asm/sparcv9-mont.pl $(CFLAGS) > $@
++vis3-mont.s:		asm/vis3-mont.pl
++	$(PERL) asm/vis3-mont.pl $(CFLAGS) > $@
++sparct4-mont.S:	asm/sparct4-mont.pl
++	$(PERL) asm/sparct4-mont.pl $(CFLAGS) > $@
++sparcv9-gf2m.S:	asm/sparcv9-gf2m.pl
++	$(PERL) asm/sparcv9-gf2m.pl $(CFLAGS) > $@
+ 
+ bn-mips3.o:	asm/mips3.s
+ 	@if [ "$(CC)" = "gcc" ]; then \
+Index: openssl/crypto/bn/bn_exp.c
+===================================================================
+diff -ru openssl-1.0.1e/crypto/bn/bn_exp.c openssl-1.0.1e/crypto/bn/bn_exp.c.new
+--- bn_exp.c	2011/10/29 19:25:13	1.38
++++ bn_exp.c	2012/11/17 10:34:11	1.39
+@@ -123,8 +123,15 @@
+ # ifndef alloca
+ #  define alloca(s) __builtin_alloca((s))
+ # endif
++#else
++#include <alloca.h>
+ #endif
+ 
++#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))	/* same guard as the code that reads the capability vector */
++# include "sparc_arch.h"
++extern unsigned int OPENSSL_sparcv9cap_P[];	/* word [0] is tested against SPARCV9_* bits, word [1] against CFR_* bits */
++#endif
++
+ /* maximum precomputation table size for *variable* sliding windows */
+ #define TABLE_SIZE	32
+ 
+@@ -467,7 +467,15 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+ 	wstart=bits-1;	/* The top bit of the window */
+ 	wend=0;		/* The bottom bit of the window */
+ 
++#if 1	/* by Shay Gueron's suggestion */
++	j = mont->N.top;	/* borrow j */
++	if (bn_wexpand(r,j) == NULL) goto err;
++	r->d[0] = (0-m->d[0])&BN_MASK2;		/* 2^(top*BN_BITS2) - m */
++	for(i=1;i<j;i++) r->d[i] = (~m->d[i])&BN_MASK2;
++	r->top = j;
++#else
+ 	if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
++#endif
+ 	for (;;)
+ 		{
+ 		if (BN_is_bit_set(p,wstart) == 0)
+@@ -519,6 +527,17 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+ 		start=0;
+ 		if (wstart < 0) break;
+ 		}
++#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
++	if (OPENSSL_sparcv9cap_P[0]&(SPARCV9_VIS3|SPARCV9_PREFER_FPU))	/* with VIS3 or preferred FPU: Montgomery-multiply by 1 instead of BN_from_montgomery */
++ 	{
++ 		j = mont->N.top;	/* borrow j */
++ 		val[0]->d[0] = 1;	/* borrow val[0] */
++ 		for (i=1;i<j;i++) val[0]->d[i] = 0;	/* remaining words up to the modulus width are zero */
++ 		val[0]->top = j;	/* val[0] now holds the value 1 padded to j words */
++ 		if (!BN_mod_mul_montgomery(rr,r,val[0],mont,ctx)) goto err;
++ 		}
++ 	else
++#endif
+ 	if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
+ 	ret=1;
+ err:
+@@ -528,6 +547,28 @@ err:
+ 	return(ret);
+ 	}
+ 
++#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
++static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)	/* read up to BN_BITS2 bits of a starting at bit bitpos; 0 if that word is outside a->top */
++	{
++	BN_ULONG ret=0;
++	int wordpos;
++
++	wordpos = bitpos/BN_BITS2;	/* index of the word holding the first requested bit */
++	bitpos %= BN_BITS2;	/* offset of that bit inside the word; the visible caller passes bitpos>=0 */
++	if (wordpos>=0 && wordpos < a->top)
++		{
++		ret = a->d[wordpos]&BN_MASK2;
++		if (bitpos)
++			{
++			ret >>= bitpos;	/* discard the bits below bitpos */
++			if (++wordpos < a->top)	/* a following word exists: take the missing high bits from it */
++				ret |= a->d[wordpos]<<(BN_BITS2-bitpos);
++			}
++		}
++
++	return ret&BN_MASK2;
++}
++#endif
+ 
+ /* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific layout
+  * so that accessing any of these table values shows the same access pattern as far
+@@ -587,6 +592,9 @@
+ 	int powerbufLen = 0;
+ 	unsigned char *powerbuf=NULL;
+ 	BIGNUM tmp, am;
++#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc)
++	unsigned int t4=0;
++#endif
+ 
+ 	bn_check_top(a);
+ 	bn_check_top(p);
+@@ -621,9 +629,18 @@
+ 
+ 	/* Get the window size to use with size of p. */
+ 	window = BN_window_bits_for_ctime_exponent_size(bits);
++#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc)
++	if (window>=5 && (top&15)==0 && top<=64 &&
++	    (OPENSSL_sparcv9cap_P[1]&(CFR_MONTMUL|CFR_MONTSQR))==
++	    			     (CFR_MONTMUL|CFR_MONTSQR) &&
++	    (t4=OPENSSL_sparcv9cap_P[0]))
++		window=5;
++	else
++#endif
+ #if defined(OPENSSL_BN_ASM_MONT5)
+ 	if (window==6 && bits<=1024) window=5;	/* ~5% improvement of 2048-bit RSA sign */
+ #endif
++	(void)0;
+ 
+ 	/* Allocate a buffer large enough to hold all of the pre-computed
+ 	 * powers of am, am itself and tmp.
+@@ -656,13 +715,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+ 	tmp.flags = am.flags = BN_FLG_STATIC_DATA;
+ 
+ 	/* prepare a^0 in Montgomery domain */
+-#if 1
+- 	if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx))	goto err;
+-#else
++#if 1	/* by Shay Gueron's suggestion */
+ 	tmp.d[0] = (0-m->d[0])&BN_MASK2;	/* 2^(top*BN_BITS2) - m */
+ 	for (i=1;i<top;i++)
+ 		tmp.d[i] = (~m->d[i])&BN_MASK2;
+ 	tmp.top = top;
++#else
++	if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx))	goto err;
+ #endif
+ 
+ 	/* prepare a^1 in Montgomery domain */
+@@ -673,6 +690,121 @@
+ 		}
+ 	else	if (!BN_to_montgomery(&am,a,mont,ctx))		goto err;
+ 
++#if defined(OPENSSL_BN_ASM_MONT) && defined(__sparc)
++    if (t4)
++	{
++	typedef int (*bn_pwr5_mont_f)(BN_ULONG *tp,const BN_ULONG *np,
++			const BN_ULONG *n0,const void *table,int power,int bits);
++	int bn_pwr5_mont_t4_8(BN_ULONG *tp,const BN_ULONG *np,
++			const BN_ULONG *n0,const void *table,int power,int bits);
++	int bn_pwr5_mont_t4_16(BN_ULONG *tp,const BN_ULONG *np,
++			const BN_ULONG *n0,const void *table,int power,int bits);
++	int bn_pwr5_mont_t4_24(BN_ULONG *tp,const BN_ULONG *np,
++			const BN_ULONG *n0,const void *table,int power,int bits);
++	int bn_pwr5_mont_t4_32(BN_ULONG *tp,const BN_ULONG *np,
++			const BN_ULONG *n0,const void *table,int power,int bits);
++	static const bn_pwr5_mont_f pwr5_funcs[4] = {
++			bn_pwr5_mont_t4_8,	bn_pwr5_mont_t4_16,
++			bn_pwr5_mont_t4_24,	bn_pwr5_mont_t4_32 };
++	bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top/16-1];
++
++	typedef int (*bn_mul_mont_f)(BN_ULONG *rp,const BN_ULONG *ap,
++			const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++	int bn_mul_mont_t4_8(BN_ULONG *rp,const BN_ULONG *ap,
++			const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++	int bn_mul_mont_t4_16(BN_ULONG *rp,const BN_ULONG *ap,
++			const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++	int bn_mul_mont_t4_24(BN_ULONG *rp,const BN_ULONG *ap,
++			const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++	int bn_mul_mont_t4_32(BN_ULONG *rp,const BN_ULONG *ap,
++			const void *bp,const BN_ULONG *np,const BN_ULONG *n0);
++	static const bn_mul_mont_f mul_funcs[4] = {
++			bn_mul_mont_t4_8,	bn_mul_mont_t4_16,
++			bn_mul_mont_t4_24,	bn_mul_mont_t4_32 };
++	bn_mul_mont_f mul_worker = mul_funcs[top/16-1];
++
++	void bn_mul_mont_vis3(BN_ULONG *rp,const BN_ULONG *ap,
++			const void *bp,const BN_ULONG *np,
++			const BN_ULONG *n0,int num);
++	void bn_mul_mont_t4(BN_ULONG *rp,const BN_ULONG *ap,
++			const void *bp,const BN_ULONG *np,
++			const BN_ULONG *n0,int num);
++	void bn_mul_mont_gather5_t4(BN_ULONG *rp,const BN_ULONG *ap,
++			const void *table,const BN_ULONG *np,
++			const BN_ULONG *n0,int num,int power);
++	void bn_flip_n_scatter5_t4(const BN_ULONG *inp,size_t num,
++			void *table,size_t power);
++	void bn_gather5_t4(BN_ULONG *out,size_t num,
++			void *table,size_t power);
++	void bn_flip_t4(BN_ULONG *dst,BN_ULONG *src,size_t num);
++
++	BN_ULONG *np=mont->N.d, *n0=mont->n0;
++	int stride = 5*(6-(top/16-1));	/* multiple of 5, but less than 32 */
++
++	/* BN_to_montgomery can contaminate words above .top
++	 * [in BN_DEBUG[_DEBUG] build]... */
++	for (i=am.top; i<top; i++)	am.d[i]=0;
++	for (i=tmp.top; i<top; i++)	tmp.d[i]=0;
++
++	bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,0);
++	bn_flip_n_scatter5_t4(am.d,top,powerbuf,1);
++	if (!(*mul_worker)(tmp.d,am.d,am.d,np,n0) &&
++	    !(*mul_worker)(tmp.d,am.d,am.d,np,n0))
++		bn_mul_mont_vis3(tmp.d,am.d,am.d,np,n0,top);
++	bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,2);
++
++	for (i=3; i<32; i++)
++		{
++		/* Calculate a^i = a^(i-1) * a */
++		if (!(*mul_worker)(tmp.d,tmp.d,am.d,np,n0) &&
++		    !(*mul_worker)(tmp.d,tmp.d,am.d,np,n0))
++			bn_mul_mont_vis3(tmp.d,tmp.d,am.d,np,n0,top);
++		bn_flip_n_scatter5_t4(tmp.d,top,powerbuf,i);
++		}
++
++	/* switch to 64-bit domain */ 
++	np = alloca(top*sizeof(BN_ULONG));
++	top /= 2;
++	bn_flip_t4(np,mont->N.d,top);
++
++	bits--;
++	for (wvalue=0, i=bits%5; i>=0; i--,bits--)
++		wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
++	bn_gather5_t4(tmp.d,top,powerbuf,wvalue);
++
++	/* Scan the exponent one window at a time starting from the most
++	 * significant bits.
++	 */
++	while (bits >= 0)
++		{
++		if (bits < stride) stride = bits+1;
++		bits -= stride;
++		wvalue = (bn_get_bits(p,bits+1));
++
++		if ((*pwr5_worker)(tmp.d,np,n0,powerbuf,wvalue,stride)) continue;
++		/* retry once and fall back */
++		if ((*pwr5_worker)(tmp.d,np,n0,powerbuf,wvalue,stride)) continue;
++
++		bits += stride-5;
++		wvalue >>= stride-5;
++		wvalue &= 31;
++		bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++		bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++		bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++		bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++		bn_mul_mont_t4(tmp.d,tmp.d,tmp.d,np,n0,top);
++		bn_mul_mont_gather5_t4(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
++		}
++
++	bn_flip_t4(tmp.d,tmp.d,top);
++	top *= 2;
++	/* back to 32-bit domain */
++	tmp.top=top;
++	bn_correct_top(&tmp);
++	OPENSSL_cleanse(np,top*sizeof(BN_ULONG));
++	}
++    else
++#endif
+ #if defined(OPENSSL_BN_ASM_MONT5)
+     /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
+      * specifically optimization of cache-timing attack countermeasures
+@@ -816,6 +990,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+ 	}
+ 
+  	/* Convert the final result from montgomery to standard format */
++#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
++	if (OPENSSL_sparcv9cap_P[0]&(SPARCV9_VIS3|SPARCV9_PREFER_FPU))	/* with VIS3 or preferred FPU: Montgomery-multiply by 1 instead of BN_from_montgomery */
++		{
++		am.d[0] = 1;	/* borrow am */
++		for (i=1;i<top;i++) am.d[i] = 0;	/* remaining words up to top are zero; am.top itself is not changed here */
++		if (!BN_mod_mul_montgomery(rr,&tmp,&am,mont,ctx)) goto err;
++		}
++	else
++#endif
+ 	if (!BN_from_montgomery(rr,&tmp,mont,ctx)) goto err;
+ 	ret=1;
+ err:
 Index: openssl/apps/speed.c
 ===================================================================
 diff -ru openssl-1.0.1e/apps/spped.c openssl-1.0.1e/apps/speed.c