PSARC 2013/034 OpenSSL 1.0.1
15824598 SUNBT7206150 T4 AES should be embedded in the OpenSSL upstream src
--- a/components/openssl/openssl-1.0.1/Makefile Wed Jul 17 00:17:02 2013 -0700
+++ b/components/openssl/openssl-1.0.1/Makefile Wed Jul 17 15:19:38 2013 -0700
@@ -181,7 +181,9 @@
$(LN) -fs $(COMPONENT_DIR)/engines/t4/t4_sha?.S $(@D)/crypto/sha/asm; \
$(LN) -fs $(COMPONENT_DIR)/wanboot-openssl/wanboot-stubs.c $(@D)/crypto; \
$(LN) -fs $(COMPONENT_DIR)/inline-t4/sparc_arch.h $(@D)/crypto/; \
- $(LN) -fs $(COMPONENT_DIR)/inline-t4/md5-sparcv9.pl $(@D)/crypto/md5/asm; )
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/md5-sparcv9.pl $(@D)/crypto/md5/asm; \
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/aest4-sparcv9.pl $(@D)/crypto/aes/asm; \
+ $(LN) -fs $(COMPONENT_DIR)/inline-t4/sparcv9_modes.pl $(@D)/crypto/perlasm; )
# OpenSSL for wanboot is built on sparc only.
@@ -194,7 +196,8 @@
# Object files for wanboot-openssl.o have to be listed explicitly.
WANBOOT_OBJS = \
crypto/aes/aes-sparcv9.o crypto/aes/aes_cbc.o crypto/aes/aes_core.o \
- crypto/aes/aes_misc.o crypto/aes/aes_wrap.o crypto/asn1/a_bitstr.o \
+ crypto/aes/aes_misc.o crypto/aes/aes_wrap.o crypto/aes/aest4-sparcv9.o \
+ crypto/asn1/a_bitstr.o \
crypto/asn1/a_bool.o crypto/asn1/a_bytes.o crypto/asn1/a_d2i_fp.o \
crypto/asn1/a_digest.o crypto/asn1/a_dup.o crypto/asn1/a_enum.o \
crypto/asn1/a_gentm.o crypto/asn1/a_i2d_fp.o crypto/asn1/a_int.o \
--- a/components/openssl/openssl-1.0.1/engines/pkcs11/hw_pk11.c Wed Jul 17 00:17:02 2013 -0700
+++ b/components/openssl/openssl-1.0.1/engines/pkcs11/hw_pk11.c Wed Jul 17 15:19:38 2013 -0700
@@ -331,8 +331,12 @@
#ifdef SOLARIS_HW_SLOT_SELECTION
static int check_hw_mechanisms(void);
static int nid_in_table(int nid, int *nid_table);
-static int hw_aes_instruction_set_present(void);
+#if defined(__amd64) || defined(__i386)
+static int hw_x86_aes_instruction_set_present(void);
+#endif
#if defined(__sparc)
+static int hw_yf_aes_instruction_set_present(void);
+static int hw_fj_aes_instruction_set_present(void);
static int hw_yf_digest_instruction_present(void);
#endif
#endif /* SOLARIS_HW_SLOT_SELECTION */
@@ -2650,6 +2654,30 @@
if (!cipher)
return (pk11_usable_ciphers(nids));
+#ifdef __sparc
+ /*
+ * If T4 AES instructions are present, don't advertise
+ * the AES mechanisms for the pkcs11 engine, as AES operations
+ * should be accelerated by the inline T4 instructions
+ * in the OpenSSL upstream code.
+ */
+ if (hw_yf_aes_instruction_set_present() == 1) {
+ switch (nid) {
+ case NID_aes_128_cbc:
+ case NID_aes_192_cbc:
+ case NID_aes_256_cbc:
+ case NID_aes_128_ecb:
+ case NID_aes_192_ecb:
+ case NID_aes_256_ecb:
+ case NID_aes_128_ctr:
+ case NID_aes_192_ctr:
+ case NID_aes_256_ctr:
+ *cipher = NULL;
+ return (0);
+ }
+ }
+#endif
+
switch (nid)
{
case NID_des_ede3_cbc:
@@ -3487,6 +3515,21 @@
for (i = 0; i < PK11_CIPHER_MAX; ++i)
{
+#ifdef __sparc
+ /*
+ * If the T4 AES instructions are present, don't include the AES
+ * mechanisms in the supported symmetric cipher list.
+ */
+ if (hw_yf_aes_instruction_set_present() == 1) {
+ switch (ciphers[i].mech_type) {
+ case CKM_AES_CBC:
+ case CKM_AES_ECB:
+ case CKM_AES_CTR:
+ continue;
+ }
+ }
+#endif
+
pk11_get_symmetric_cipher(pflist, current_slot,
current_slot_n_cipher, local_cipher_nids, &ciphers[i]);
}
@@ -3738,19 +3781,14 @@
if (nid_table == NULL)
return (1);
+#if defined(__x86)
/*
- * If we have an AES instruction set on SPARC we route everything
- * through the Crypto Framework (ie., through pkcs11_softtoken in this
- * case). This is for T4 which has HW instructions for AES, DES, MD5,
- * SHA1, SHA256, SHA512, MONTMUL, and MPMUL.
- *
* On Intel, if we have AES-NI instruction set we route AES to the
* Crypto Framework. Intel CPUs do not have other instruction sets for
* HW crypto acceleration so we check the HW NID table for any other
* mechanism.
*/
-#if defined(__x86)
- if (hw_aes_instruction_set_present() == 1)
+ if (hw_x86_aes_instruction_set_present() == 1)
{
switch (nid)
{
@@ -3760,21 +3798,24 @@
case NID_aes_128_cbc:
case NID_aes_192_cbc:
case NID_aes_256_cbc:
- return (1);
- }
- /*
- * These are variables, cannot be used as case expressions.
- */
- if (nid == NID_aes_128_ctr ||
- nid == NID_aes_192_ctr ||
- nid == NID_aes_256_ctr)
- {
+ case NID_aes_128_ctr:
+ case NID_aes_192_ctr:
+ case NID_aes_256_ctr:
return (1);
}
}
#elif defined(__sparc)
- if (hw_aes_instruction_set_present() == 1)
+ /*
+ * If we have a T4 AES instruction set on SPARC, we won't process AES in
+ * the Crypto Framework so that the job can be processed directly using
+ * the inline AES instructions. This is for T4 which has HW instructions
+ * for AES, DES, MD5, SHA1, SHA256, SHA512, MONTMUL, and MPMUL.
+ */
+ if (hw_yf_aes_instruction_set_present() == 1) {
+ return (0);
+ } else if (hw_fj_aes_instruction_set_present() == 1) {
return (1);
+ }
#endif
/* The table is never full, there is always at least one NID_undef. */
@@ -3790,29 +3831,54 @@
return (0);
}
+#if defined(__amd64) || defined(__i386)
/* Do we have an AES instruction set? */
static int
-hw_aes_instruction_set_present(void)
+hw_x86_aes_instruction_set_present(void)
{
static int present = -1;
if (present == -1)
{
uint_t ui = 0;
-
(void) getisax(&ui, 1);
-
-#if defined(__amd64) || defined(__i386)
present = (ui & AV_386_AES) > 0;
-#elif defined(__sparc)
- present = (ui & (AV_SPARC_AES|AV_SPARC_FJAES)) > 0;
-#endif
}
return (present);
}
+#endif
#if defined(__sparc)
+
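+/* Do we have a T4 AES instruction set? */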
+static int
+hw_yf_aes_instruction_set_present(void)
+ {
+ static int present = -1;
+ if (present == -1)
+ {
+ uint_t ui = 0;
+ (void) getisax(&ui, 1);
+ present = (ui & (AV_SPARC_AES)) > 0;
+ }
+ return (present);
+ }
+
+/* Do we have a Fujitsu AES instruction set? */
+static int
+hw_fj_aes_instruction_set_present(void)
+ {
+ static int present = -1;
+ if (present == -1)
+ {
+ uint_t ui = 0;
+ (void) getisax(&ui, 1);
+ present = (ui & (AV_SPARC_FJAES)) > 0;
+ }
+
+ return (present);
+ }
+
static int
hw_yf_digest_instruction_present(void)
{
--- a/components/openssl/openssl-1.0.1/engines/t4/eng_t4.c Wed Jul 17 00:17:02 2013 -0700
+++ b/components/openssl/openssl-1.0.1/engines/t4/eng_t4.c Wed Jul 17 15:19:38 2013 -0700
@@ -58,72 +58,24 @@
*/
/*
- * This engine supports SPARC microprocessors that provide AES and other
+ * This engine supports SPARC microprocessors that provide DES and other
* cipher and hash instructions, such as the T4 microprocessor.
*/
#include <openssl/opensslconf.h>
-#if !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AES_T4) && \
- !defined(OPENSSL_NO_AES)
+#if !defined(OPENSSL_NO_HW)
#include <sys/types.h>
#include <sys/auxv.h> /* getisax() */
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <errno.h>
-#include <openssl/aes.h>
#include <openssl/engine.h>
-#include "eng_t4_aes_asm.h"
#define T4_LIB_NAME "SPARC T4 engine"
#include "eng_t4_err.c"
-/* Copied from Solaris aes_impl.h */
-#ifndef MAX_AES_NR
-#define MAX_AES_NR 14 /* Maximum number of rounds */
-#endif
-#ifndef MAX_AES_NB
-#define MAX_AES_NB 4 /* Number of columns comprising a state */
-#endif
-
-/* Index for the supported ciphers */
-typedef enum {
- T4_AES_128_CBC,
- T4_AES_192_CBC,
- T4_AES_256_CBC,
-#ifndef SOLARIS_NO_AES_CFB128
- T4_AES_128_CFB128,
- T4_AES_192_CFB128,
- T4_AES_256_CFB128,
-#endif /* !SOLARIS_NO_AES_CFB128 */
- T4_AES_128_CTR,
- T4_AES_192_CTR,
- T4_AES_256_CTR,
- T4_AES_128_ECB,
- T4_AES_192_ECB,
- T4_AES_256_ECB,
- T4_CIPHER_MAX
-} t4_cipher_id;
-
-/* T4 cipher context; must be 8-byte aligned (last field must be uint64_t) */
-typedef struct t4_cipher_ctx {
- t4_cipher_id index;
- uint64_t *iv;
- uint64_t aligned_iv_buffer[2]; /* use if original IV unaligned */
- /* Encryption and decryption key schedule are the same: */
- uint64_t t4_ks[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
-} t4_cipher_ctx_t;
-
-typedef struct t4_cipher {
- t4_cipher_id id;
- int nid;
- int iv_len;
- int min_key_len;
- int max_key_len;
- unsigned long flags;
-} t4_cipher_t;
-
/* Constants used when creating the ENGINE */
static const char *ENGINE_T4_ID = "t4";
static const char *ENGINE_T4_NAME = "SPARC T4 engine support";
@@ -165,10 +117,7 @@
#ifndef DYNAMIC_ENGINE
#pragma inline(t4_bind)
#endif
-static t4_cipher_id get_cipher_index_by_nid(int nid);
-#pragma inline(get_cipher_index_by_nid)
-static void t4_instructions_present(_Bool *aes_present, _Bool *des_present,
- _Bool *montmul_present);
+static void t4_instructions_present(_Bool *des_present, _Bool *montmul_present);
#pragma inline(t4_instructions_present)
/* RSA_METHOD structure used by ENGINE_set_RSA() */
@@ -183,12 +132,6 @@
/* Static variables */
/* This can't be const as NID*ctr is inserted when the engine is initialized */
static int t4_cipher_nids[] = {
- NID_aes_128_cbc, NID_aes_192_cbc, NID_aes_256_cbc,
-#ifndef SOLARIS_NO_AES_CFB128
- NID_aes_128_cfb128, NID_aes_192_cfb128, NID_aes_256_cfb128,
-#endif
- NID_aes_128_ctr, NID_aes_192_ctr, NID_aes_256_ctr,
- NID_aes_128_ecb, NID_aes_192_ecb, NID_aes_256_ecb,
#ifndef OPENSSL_NO_DES
/* Must be at end of list (see t4_des_cipher_count in t4_bind() */
NID_des_cbc, NID_des_ede3_cbc, NID_des_ecb, NID_des_ede3_ecb,
@@ -198,66 +141,6 @@
static int t4_cipher_count =
(sizeof (t4_cipher_nids) / sizeof (t4_cipher_nids[0]));
-/*
- * Cipher Table for all supported symmetric ciphers.
- * Must be in same order as t4_cipher_id.
- */
-static t4_cipher_t t4_cipher_table[] = {
- /* ID NID IV min- max-key flags */
- {T4_AES_128_CBC, NID_aes_128_cbc, 16, 16, 16, 0},
- {T4_AES_192_CBC, NID_aes_192_cbc, 16, 24, 24, 0},
- {T4_AES_256_CBC, NID_aes_256_cbc, 16, 32, 32, 0},
-#ifndef SOLARIS_NO_AES_CFB128
- {T4_AES_128_CFB128, NID_aes_128_cfb128, 16, 16, 16,
- EVP_CIPH_NO_PADDING},
- {T4_AES_192_CFB128, NID_aes_192_cfb128, 16, 24, 24,
- EVP_CIPH_NO_PADDING},
- {T4_AES_256_CFB128, NID_aes_256_cfb128, 16, 32, 32,
- EVP_CIPH_NO_PADDING},
-#endif
- {T4_AES_128_CTR, NID_aes_128_ctr, 16, 16, 16,
- EVP_CIPH_NO_PADDING},
- {T4_AES_192_CTR, NID_aes_192_ctr, 16, 24, 24,
- EVP_CIPH_NO_PADDING},
- {T4_AES_256_CTR, NID_aes_256_ctr, 16, 32, 32,
- EVP_CIPH_NO_PADDING},
- {T4_AES_128_ECB, NID_aes_128_ecb, 0, 16, 16, 0},
- {T4_AES_192_ECB, NID_aes_192_ecb, 0, 24, 24, 0},
- {T4_AES_256_ECB, NID_aes_256_ecb, 0, 32, 32, 0},
-};
-
-
-/* Formal declaration for functions in EVP_CIPHER structure */
-static int t4_cipher_init_aes(EVP_CIPHER_CTX *ctx, const unsigned char *key,
- const unsigned char *iv, int enc);
-
-static int t4_cipher_do_aes_128_cbc(EVP_CIPHER_CTX *ctx, unsigned char *out,
- const unsigned char *in, size_t inl);
-static int t4_cipher_do_aes_192_cbc(EVP_CIPHER_CTX *ctx, unsigned char *out,
- const unsigned char *in, size_t inl);
-static int t4_cipher_do_aes_256_cbc(EVP_CIPHER_CTX *ctx, unsigned char *out,
- const unsigned char *in, size_t inl);
-#ifndef SOLARIS_NO_AES_CFB128
-static int t4_cipher_do_aes_128_cfb128(EVP_CIPHER_CTX *ctx, unsigned char *out,
- const unsigned char *in, size_t inl);
-static int t4_cipher_do_aes_192_cfb128(EVP_CIPHER_CTX *ctx, unsigned char *out,
- const unsigned char *in, size_t inl);
-static int t4_cipher_do_aes_256_cfb128(EVP_CIPHER_CTX *ctx, unsigned char *out,
- const unsigned char *in, size_t inl);
-#endif
-static int t4_cipher_do_aes_128_ctr(EVP_CIPHER_CTX *ctx, unsigned char *out,
- const unsigned char *in, size_t inl);
-static int t4_cipher_do_aes_192_ctr(EVP_CIPHER_CTX *ctx, unsigned char *out,
- const unsigned char *in, size_t inl);
-static int t4_cipher_do_aes_256_ctr(EVP_CIPHER_CTX *ctx, unsigned char *out,
- const unsigned char *in, size_t inl);
-static int t4_cipher_do_aes_128_ecb(EVP_CIPHER_CTX *ctx, unsigned char *out,
- const unsigned char *in, size_t inl);
-static int t4_cipher_do_aes_192_ecb(EVP_CIPHER_CTX *ctx, unsigned char *out,
- const unsigned char *in, size_t inl);
-static int t4_cipher_do_aes_256_ecb(EVP_CIPHER_CTX *ctx, unsigned char *out,
- const unsigned char *in, size_t inl);
-
/*
* Cipher Algorithms
@@ -274,120 +157,6 @@
* set_asn1_parameters(), get_asn1_parameters(), ctrl(), app_data
*/
-static const EVP_CIPHER t4_aes_128_cbc = {
- NID_aes_128_cbc,
- 16, 16, 16,
- EVP_CIPH_CBC_MODE,
- t4_cipher_init_aes, t4_cipher_do_aes_128_cbc, NULL,
- sizeof (t4_cipher_ctx_t),
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv,
- NULL, NULL
-};
-static const EVP_CIPHER t4_aes_192_cbc = {
- NID_aes_192_cbc,
- 16, 24, 16,
- EVP_CIPH_CBC_MODE,
- t4_cipher_init_aes, t4_cipher_do_aes_192_cbc, NULL,
- sizeof (t4_cipher_ctx_t),
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv,
- NULL, NULL
-};
-static const EVP_CIPHER t4_aes_256_cbc = {
- NID_aes_256_cbc,
- 16, 32, 16,
- EVP_CIPH_CBC_MODE,
- t4_cipher_init_aes, t4_cipher_do_aes_256_cbc, NULL,
- sizeof (t4_cipher_ctx_t),
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv,
- NULL, NULL
-};
-
-#ifndef SOLARIS_NO_AES_CFB128
-static const EVP_CIPHER t4_aes_128_cfb128 = {
- NID_aes_128_cfb128,
- 16, 16, 16,
- EVP_CIPH_CFB_MODE,
- t4_cipher_init_aes, t4_cipher_do_aes_128_cfb128, NULL,
- sizeof (t4_cipher_ctx_t),
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv,
- NULL, NULL
-};
-static const EVP_CIPHER t4_aes_192_cfb128 = {
- NID_aes_192_cfb128,
- 16, 24, 16,
- EVP_CIPH_CFB_MODE,
- t4_cipher_init_aes, t4_cipher_do_aes_192_cfb128, NULL,
- sizeof (t4_cipher_ctx_t),
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv,
- NULL, NULL
-};
-static const EVP_CIPHER t4_aes_256_cfb128 = {
- NID_aes_256_cfb128,
- 16, 32, 16,
- EVP_CIPH_CFB_MODE,
- t4_cipher_init_aes, t4_cipher_do_aes_256_cfb128, NULL,
- sizeof (t4_cipher_ctx_t),
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv,
- NULL, NULL
-};
-#endif /* !SOLARIS_NO_AES_CFB128 */
-
-static EVP_CIPHER t4_aes_128_ctr = {
- NID_aes_128_ctr,
- 16, 16, 16,
- EVP_CIPH_CTR_MODE,
- t4_cipher_init_aes, t4_cipher_do_aes_128_ctr, NULL,
- sizeof (t4_cipher_ctx_t),
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv,
- NULL, NULL
-};
-static EVP_CIPHER t4_aes_192_ctr = {
- NID_aes_192_ctr,
- 16, 24, 16,
- EVP_CIPH_CTR_MODE,
- t4_cipher_init_aes, t4_cipher_do_aes_192_ctr, NULL,
- sizeof (t4_cipher_ctx_t),
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv,
- NULL, NULL
-};
-static EVP_CIPHER t4_aes_256_ctr = {
- NID_aes_256_ctr,
- 16, 32, 16,
- EVP_CIPH_CTR_MODE,
- t4_cipher_init_aes, t4_cipher_do_aes_256_ctr, NULL,
- sizeof (t4_cipher_ctx_t),
- EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv,
- NULL, NULL
-};
-
-/*
- * ECB modes don't use an Initial Vector, so that's why set_asn1_parameters,
- * get_asn1_parameters, and cleanup fields are set to NULL.
- */
-static const EVP_CIPHER t4_aes_128_ecb = {
- NID_aes_128_ecb,
- 16, 16, 0,
- EVP_CIPH_ECB_MODE,
- t4_cipher_init_aes, t4_cipher_do_aes_128_ecb, NULL,
- sizeof (t4_cipher_ctx_t),
- NULL, NULL, NULL, NULL
-};
-static const EVP_CIPHER t4_aes_192_ecb = {
- NID_aes_192_ecb,
- 16, 24, 0,
- EVP_CIPH_ECB_MODE,
- t4_cipher_init_aes, t4_cipher_do_aes_192_ecb, NULL,
- sizeof (t4_cipher_ctx_t),
- NULL, NULL, NULL, NULL
-};
-static const EVP_CIPHER t4_aes_256_ecb = {
- NID_aes_256_ecb,
- 16, 32, 0,
- EVP_CIPH_ECB_MODE,
- t4_cipher_init_aes, t4_cipher_do_aes_256_ecb, NULL,
- sizeof (t4_cipher_ctx_t),
- NULL, NULL, NULL, NULL
-};
#ifndef OPENSSL_NO_DES
extern const EVP_CIPHER t4_des_cbc;
@@ -402,13 +171,12 @@
*/
/*
- * Set aes_present, des_present and montmul_present to B_FALSE or B_TRUE
- * depending on whether the current SPARC processor supports AES, DES
+ * Set des_present and montmul_present to B_FALSE or B_TRUE
+ * depending on whether the current SPARC processor supports DES
* and MONTMUL, respectively.
*/
static void
-t4_instructions_present(_Bool *aes_present, _Bool *des_present,
- _Bool *montmul_present)
+t4_instructions_present(_Bool *des_present, _Bool *montmul_present)
{
#ifdef OPENSSL_NO_DES
#undef AV_SPARC_DES
@@ -417,7 +185,6 @@
uint_t ui;
(void) getisax(&ui, 1);
- *aes_present = ((ui & AV_SPARC_AES) != 0);
*des_present = ((ui & AV_SPARC_DES) != 0);
*montmul_present = ((ui & AV_SPARC_MONT) != 0);
}
@@ -443,35 +210,6 @@
}
switch (nid) {
- case NID_aes_128_cbc:
- *cipher = &t4_aes_128_cbc;
- break;
- case NID_aes_192_cbc:
- *cipher = &t4_aes_192_cbc;
- break;
- case NID_aes_256_cbc:
- *cipher = &t4_aes_256_cbc;
- break;
- case NID_aes_128_ecb:
- *cipher = &t4_aes_128_ecb;
- break;
- case NID_aes_192_ecb:
- *cipher = &t4_aes_192_ecb;
- break;
- case NID_aes_256_ecb:
- *cipher = &t4_aes_256_ecb;
- break;
-#ifndef SOLARIS_NO_AES_CFB128
- case NID_aes_128_cfb128:
- *cipher = &t4_aes_128_cfb128;
- break;
- case NID_aes_192_cfb128:
- *cipher = &t4_aes_192_cfb128;
- break;
- case NID_aes_256_cfb128:
- *cipher = &t4_aes_256_cfb128;
- break;
-#endif /* !SOLARIS_NO_AES_CFB128 */
#ifndef OPENSSL_NO_DES
case NID_des_cbc:
*cipher = &t4_des_cbc;
@@ -486,15 +224,6 @@
*cipher = &t4_des3_ecb;
break;
#endif /* !OPENSSL_NO_DES */
- case NID_aes_128_ctr:
- *cipher = &t4_aes_128_ctr;
- break;
- case NID_aes_192_ctr:
- *cipher = &t4_aes_192_ctr;
- break;
- case NID_aes_256_ctr:
- *cipher = &t4_aes_256_ctr;
- break;
default:
/* cipher not supported */
*cipher = NULL;
@@ -505,260 +234,6 @@
}
-/* Called by t4_cipher_init_aes() */
-static t4_cipher_id
-get_cipher_index_by_nid(int nid)
-{
- t4_cipher_id i;
-
- for (i = (t4_cipher_id)0; i < T4_CIPHER_MAX; ++i)
- if (t4_cipher_table[i].nid == nid)
- return (i);
- return (T4_CIPHER_MAX);
-}
-
-
-/* ARGSUSED2 */
-static int
-t4_cipher_init_aes(EVP_CIPHER_CTX *ctx, const unsigned char *key,
- const unsigned char *iv, int enc)
-{
- t4_cipher_ctx_t *tctx = ctx->cipher_data;
- uint64_t *t4_ks = tctx->t4_ks;
- t4_cipher_t *t4_cipher;
- t4_cipher_id index;
- int key_len = ctx->key_len;
- uint64_t aligned_key_buffer[4]; /* 16, 24, or 32 bytes long */
- uint64_t *aligned_key;
-
- if (key == NULL) {
- T4err(T4_F_CIPHER_INIT_AES, T4_R_CIPHER_KEY);
- return (0);
- }
-
- /* Get the cipher entry index in t4_cipher_table from nid */
- index = get_cipher_index_by_nid(ctx->cipher->nid);
- if (index >= T4_CIPHER_MAX) {
- T4err(T4_F_CIPHER_INIT_AES, T4_R_CIPHER_NID);
- return (0); /* Error */
- }
- t4_cipher = &t4_cipher_table[index];
-
- /* Check key size and iv size */
- if (ctx->cipher->iv_len < t4_cipher->iv_len) {
- T4err(T4_F_CIPHER_INIT_AES, T4_R_IV_LEN_INCORRECT);
- return (0); /* Error */
- }
- if ((key_len < t4_cipher->min_key_len) ||
- (key_len > t4_cipher->max_key_len)) {
- T4err(T4_F_CIPHER_INIT_AES, T4_R_KEY_LEN_INCORRECT);
- return (0); /* Error */
- }
-
- /* Set cipher flags, if any */
- ctx->flags |= t4_cipher->flags;
-
- /* Align the key */
- if (((unsigned long)key & 0x7) == 0) /* already aligned */
- aligned_key = (uint64_t *)key;
- else { /* key is not 8-byte aligned */
-#ifdef DEBUG_T4
- (void) fprintf(stderr, "T4: key is not 8 byte aligned\n");
-#endif
- (void) memcpy(aligned_key_buffer, key, key_len);
- aligned_key = aligned_key_buffer;
- }
-
-
- /*
- * Expand the key schedule.
- * Copy original key to start of t4_ks key schedule. Note that the
- * encryption and decryption key schedule are the same for T4.
- */
- switch (key_len) {
- case 16:
- t4_aes_expand128(&t4_ks[2],
- (const uint32_t *)aligned_key);
- t4_ks[0] = aligned_key[0];
- t4_ks[1] = aligned_key[1];
- break;
- case 24:
- t4_aes_expand192(&t4_ks[3],
- (const uint32_t *)aligned_key);
- t4_ks[0] = aligned_key[0];
- t4_ks[1] = aligned_key[1];
- t4_ks[2] = aligned_key[2];
- break;
- case 32:
- t4_aes_expand256(&t4_ks[4],
- (const uint32_t *)aligned_key);
- t4_ks[0] = aligned_key[0];
- t4_ks[1] = aligned_key[1];
- t4_ks[2] = aligned_key[2];
- t4_ks[3] = aligned_key[3];
- break;
- default:
- T4err(T4_F_CIPHER_INIT_AES, T4_R_CIPHER_KEY);
- return (0);
- }
-
- /* Save index to cipher */
- tctx->index = index;
-
- /* Align IV, if needed */
- if (t4_cipher->iv_len <= 0) { /* no IV (such as with ECB mode) */
- tctx->iv = NULL;
- } else if (((unsigned long)ctx->iv & 0x7) == 0) { /* already aligned */
- tctx->iv = (uint64_t *)ctx->iv;
- } else {
- /* IV is not 8 byte aligned */
- (void) memcpy(tctx->aligned_iv_buffer, ctx->iv,
- ctx->cipher->iv_len);
- tctx->iv = tctx->aligned_iv_buffer;
-#ifdef DEBUG_T4
- (void) fprintf(stderr,
- "t4_cipher_init_aes: IV is not 8 byte aligned\n");
- (void) fprintf(stderr,
- "t4_cipher_init_aes: ctx->cipher->iv_len =%d\n",
- ctx->cipher->iv_len);
- (void) fprintf(stderr, "t4_cipher_init_aes: after "
- "re-alignment, tctx->iv = %p\n", (void *)tctx->iv);
-#endif /* DEBUG_T4 */
- }
-
- return (1);
-}
-
-
-/*
- * ENCRYPT_UPDATE or DECRYPT_UPDATE
- */
-#define T4_CIPHER_DO_AES(t4_cipher_do_aes, t4_aes_load_keys_for_encrypt, \
- t4_aes_encrypt, t4_aes_load_keys_for_decrypt, t4_aes_decrypt, iv) \
-static int \
-t4_cipher_do_aes(EVP_CIPHER_CTX *ctx, unsigned char *out, \
- const unsigned char *in, size_t inl) \
-{ \
- t4_cipher_ctx_t *tctx = ctx->cipher_data; \
- uint64_t *t4_ks = tctx->t4_ks; \
- unsigned long outl = inl; \
- unsigned char *bufin_alloc = NULL, *bufout_alloc = NULL; \
- unsigned char *bufin, *bufout; \
- \
- /* "in" and "out" must be 8 byte aligned */ \
- if (((unsigned long)in & 0x7) == 0) { /* already aligned */ \
- bufin = (unsigned char *)in; \
- } else { /* "in" is not 8 byte aligned */ \
- if (((unsigned long)out & 0x7) == 0) { /* aligned */ \
- /* use output buffer for input */ \
- bufin = out; \
- } else { \
- bufin = bufin_alloc = OPENSSL_malloc(inl); \
- if (bufin_alloc == NULL) \
- return (0); /* error */ \
- } \
- (void) memcpy(bufin, in, inl); \
- } \
- \
- if (((unsigned long)out & 0x7) == 0) { /* already aligned */ \
- bufout = out; \
- } else { /* "out" is not 8 byte aligned */ \
- if (bufin_alloc != NULL) { \
- /* use allocated input buffer for output */ \
- bufout = bufin_alloc; \
- } else { \
- bufout = bufout_alloc = OPENSSL_malloc(outl); \
- if (bufout_alloc == NULL) { \
- OPENSSL_free(bufin_alloc); \
- return (0); /* error */ \
- } \
- } \
- } \
- \
- /* Data length must be an even multiple of block size. */ \
- if ((inl & 0xf) != 0) { \
- OPENSSL_free(bufout_alloc); \
- OPENSSL_free(bufin_alloc); \
- T4err(T4_F_CIPHER_DO_AES, T4_R_NOT_BLOCKSIZE_LENGTH); \
- return (0); \
- } \
- \
- if (ctx->encrypt) { \
- t4_aes_load_keys_for_encrypt(t4_ks); \
- t4_aes_encrypt(t4_ks, (uint64_t *)bufin, \
- (uint64_t *)bufout, (size_t)inl, iv); \
- } else { /* decrypt */ \
- t4_aes_load_keys_for_decrypt(t4_ks); \
- t4_aes_decrypt(t4_ks, (uint64_t *)bufin, \
- (uint64_t *)bufout, (size_t)inl, iv); \
- } \
- \
- /* Cleanup */ \
- if (bufin_alloc != NULL) { \
- if (bufout == bufin_alloc) \
- (void) memcpy(out, bufout, outl); \
- OPENSSL_free(bufin_alloc); \
- } \
- if (bufout_alloc != NULL) { \
- (void) memcpy(out, bufout_alloc, outl); \
- OPENSSL_free(bufout_alloc); \
- } \
- \
- return (1); \
-}
-
-
-/* AES CBC mode. */
-T4_CIPHER_DO_AES(t4_cipher_do_aes_128_cbc,
- t4_aes128_load_keys_for_encrypt, t4_aes128_cbc_encrypt,
- t4_aes128_load_keys_for_decrypt, t4_aes128_cbc_decrypt, tctx->iv)
-T4_CIPHER_DO_AES(t4_cipher_do_aes_192_cbc,
- t4_aes192_load_keys_for_encrypt, t4_aes192_cbc_encrypt,
- t4_aes192_load_keys_for_decrypt, t4_aes192_cbc_decrypt, tctx->iv)
-T4_CIPHER_DO_AES(t4_cipher_do_aes_256_cbc,
- t4_aes256_load_keys_for_encrypt, t4_aes256_cbc_encrypt,
- t4_aes256_load_keys_for_decrypt, t4_aes256_cbc_decrypt, tctx->iv)
-
-/*
- * AES CFB128 mode.
- * CFB128 decrypt uses load_keys_for_encrypt() as the mode uses
- * the raw AES encrypt operation for the decryption, too.
- */
-#ifndef SOLARIS_NO_AES_CFB128
-T4_CIPHER_DO_AES(t4_cipher_do_aes_128_cfb128,
- t4_aes128_load_keys_for_encrypt, t4_aes128_cfb128_encrypt,
- t4_aes128_load_keys_for_encrypt, t4_aes128_cfb128_decrypt, tctx->iv)
-T4_CIPHER_DO_AES(t4_cipher_do_aes_192_cfb128,
- t4_aes192_load_keys_for_encrypt, t4_aes192_cfb128_encrypt,
- t4_aes192_load_keys_for_encrypt, t4_aes192_cfb128_decrypt, tctx->iv)
-T4_CIPHER_DO_AES(t4_cipher_do_aes_256_cfb128,
- t4_aes256_load_keys_for_encrypt, t4_aes256_cfb128_encrypt,
- t4_aes256_load_keys_for_encrypt, t4_aes256_cfb128_decrypt, tctx->iv)
-#endif /* !SOLARIS_NO_AES_CFB128 */
-
-/* AES CTR mode. */
-T4_CIPHER_DO_AES(t4_cipher_do_aes_128_ctr,
- t4_aes128_load_keys_for_encrypt, t4_aes128_ctr_crypt,
- t4_aes128_load_keys_for_decrypt, t4_aes128_ctr_crypt, tctx->iv)
-T4_CIPHER_DO_AES(t4_cipher_do_aes_192_ctr,
- t4_aes192_load_keys_for_encrypt, t4_aes192_ctr_crypt,
- t4_aes192_load_keys_for_decrypt, t4_aes192_ctr_crypt, tctx->iv)
-T4_CIPHER_DO_AES(t4_cipher_do_aes_256_ctr,
- t4_aes256_load_keys_for_encrypt, t4_aes256_ctr_crypt,
- t4_aes256_load_keys_for_decrypt, t4_aes256_ctr_crypt, tctx->iv)
-
-/* AES ECB mode. */
-T4_CIPHER_DO_AES(t4_cipher_do_aes_128_ecb,
- t4_aes128_load_keys_for_encrypt, t4_aes128_ecb_encrypt,
- t4_aes128_load_keys_for_decrypt, t4_aes128_ecb_decrypt, NULL)
-T4_CIPHER_DO_AES(t4_cipher_do_aes_192_ecb,
- t4_aes192_load_keys_for_encrypt, t4_aes192_ecb_encrypt,
- t4_aes192_load_keys_for_decrypt, t4_aes192_ecb_decrypt, NULL)
-T4_CIPHER_DO_AES(t4_cipher_do_aes_256_ecb,
- t4_aes256_load_keys_for_encrypt, t4_aes256_ecb_encrypt,
- t4_aes256_load_keys_for_decrypt, t4_aes256_ecb_decrypt, NULL)
-
-
/*
* Is the t4 engine available?
* Passed to ENGINE_set_init_function().
@@ -789,12 +264,12 @@
static int
t4_bind(ENGINE *e)
{
- _Bool aes_engage, des_engage, montmul_engage;
+ _Bool des_engage, montmul_engage;
- t4_instructions_present(&aes_engage, &des_engage, &montmul_engage);
+ t4_instructions_present(&des_engage, &montmul_engage);
#ifdef DEBUG_T4
(void) fprintf(stderr,
- "t4_bind: engage aes=%d, des=%d\n", aes_engage, des_engage);
+ "t4_bind: engage des=%d\n", des_engage);
#endif
#ifndef OPENSSL_NO_DES
if (!des_engage) { /* Remove DES ciphers from list */
@@ -814,9 +289,9 @@
/* Register T4 engine ID, name, and functions */
if (!ENGINE_set_id(e, ENGINE_T4_ID) ||
!ENGINE_set_name(e,
- aes_engage ? ENGINE_T4_NAME: ENGINE_NO_T4_NAME) ||
+ des_engage ? ENGINE_T4_NAME: ENGINE_NO_T4_NAME) ||
!ENGINE_set_init_function(e, t4_init) ||
- (aes_engage && !ENGINE_set_ciphers(e, t4_get_all_ciphers)) ||
+ (des_engage && !ENGINE_set_ciphers(e, t4_get_all_ciphers)) ||
#ifndef OPENSSL_NO_RSA
(montmul_engage && !ENGINE_set_RSA(e, t4_RSA())) ||
#endif /* OPENSSL_NO_RSA */
@@ -860,4 +335,4 @@
IMPLEMENT_DYNAMIC_BIND_FN(t4_bind_helper)
#endif /* DYNAMIC_ENGINE */
#endif /* COMPILE_HW_T4 */
-#endif /* !OPENSSL_NO_HW && !OPENSSL_NO_HW_AES_T4 && !OPENSSL_NO_AES */
+#endif /* !OPENSSL_NO_HW */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-1.0.1/inline-t4/aest4-sparcv9.pl Wed Jul 17 15:19:38 2013 -0700
@@ -0,0 +1,902 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by David S. Miller <[email protected]> and Andy Polyakov
+# <[email protected]>. The module is licensed under 2-clause BSD
+# license. October 2012. All rights reserved.
+# ====================================================================
+
+######################################################################
+# AES for SPARC T4.
+#
+# AES round instructions complete in 3 cycles and can be issued every
+# cycle. This means that round calculations should take 4*rounds cycles,
+# because any given round instruction depends on the result of *both*
+# previous instructions:
+#
+# |0 |1 |2 |3 |4
+# |01|01|01|
+# |23|23|23|
+# |01|01|...
+# |23|...
+#
+# Provided that fxor [with IV] takes 3 cycles to complete, critical
+# path length for CBC encrypt would be 3+4*rounds, or in other words
+# it should process one byte in at least (3+4*rounds)/16 cycles. This
+# estimate doesn't account for "collateral" instructions, such as
+# fetching input from memory, xor-ing it with zero-round key and
+# storing the result. Yet, *measured* performance [for data aligned
+# at 64-bit boundary!] deviates from this equation by less than 0.5%:
+#
+# 128-bit key 192- 256-
+# CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90
+# (*) numbers after slash are for
+# misaligned data.
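+#
+# As a quick check of the (3+4*rounds)/16 estimate above: 128-bit keys
+# (10 rounds) give (3+40)/16 = 2.69, 192-bit (12 rounds) 51/16 = 3.19,
+# and 256-bit (14 rounds) 59/16 = 3.69 cycles per byte, i.e. within
+# 0.01 of the measured aligned figures in the table.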
+#
+# Out-of-order execution logic managed to fully overlap "collateral"
+# instructions with those on critical path. Amazing!
+#
+# As with Intel AES-NI, the question is whether it's possible to improve
+# performance of parallelizable modes by interleaving round
+# instructions. Given round instruction latency and throughput, the
+# optimal interleave factor is 2. But can we expect a 2x performance
+# improvement? Well, as round instructions can be issued one per
+# cycle, they don't saturate the 2-way issue pipeline and therefore
+# there is room for "collateral" calculations... Yet, a 2x speed-up
+# over CBC encrypt remains unattainable:
+#
+# 128-bit key 192- 256-
+# CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61
+# CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61
+# (*) numbers after slash are for
+# misaligned data.
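+#
+# With 128-bit keys and aligned data, for example, the interleaved CBC
+# decrypt at 1.64 cycles per byte is only about 1.65x faster than CBC
+# encrypt at 2.70, rather than the hoped-for 2x.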
+#
+# Estimates based on the amount of instructions, under the assumption
+# that round instructions are not pairable with any other instruction,
+# suggest that the latter is indeed the case and the pipeline runs
+# underutilized. It should be noted that T4 out-of-order execution
+# logic is so capable that performance gain from 2x interleave is
+# not even impressive, ~7-13% over non-interleaved code, largest
+# for 256-bit keys.
+
+# To anchor to something else, software implementation processes
+# one byte in 29 cycles with 128-bit key on same processor. Intel
+# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
+# in 0.93, naturally with AES-NI.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "sparcv9_modes.pl";
+
+&asm_init(@ARGV);
+
+$::evp=1; # if $evp is set to 0, script generates module with
+# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
+# points. These however are not fully compatible with openssl/aes.h,
+# because they expect AES_KEY to be aligned at 64-bit boundary. When
+# used through EVP, alignment is arranged at EVP layer. Second thing
+# that is arranged by EVP is at least 32-bit alignment of IV.
+
+######################################################################
+# single-round subroutines
+#
+{
+my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
+
+$code=<<___;
+.text
+
+.globl aes_t4_encrypt
+.align 32
+aes_t4_encrypt:
+ andcc $inp, 7, %g1 ! is input aligned?
+ andn $inp, 7, $inp
+
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+
+ ldx [$inp + 0], %o4
+ bz,pt %icc, 1f
+ ldx [$inp + 8], %o5
+ ldx [$inp + 16], $inp
+ sll %g1, 3, %g1
+ sub %g0, %g1, %o3
+ sllx %o4, %g1, %o4
+ sllx %o5, %g1, %g1
+ srlx %o5, %o3, %o5
+ srlx $inp, %o3, %o3
+ or %o5, %o4, %o4
+ or %o3, %g1, %o5
+1:
+ ld [$key + 240], $rounds
+ ldd [$key + 16], %f12
+ ldd [$key + 24], %f14
+ xor %g4, %o4, %o4
+ xor %g5, %o5, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ srl $rounds, 1, $rounds
+ ldd [$key + 32], %f16
+ sub $rounds, 1, $rounds
+ ldd [$key + 40], %f18
+ add $key, 48, $key
+
+.Lenc:
+ aes_eround01 %f12, %f0, %f2, %f4
+ aes_eround23 %f14, %f0, %f2, %f2
+ ldd [$key + 0], %f12
+ ldd [$key + 8], %f14
+ sub $rounds,1,$rounds
+ aes_eround01 %f16, %f4, %f2, %f0
+ aes_eround23 %f18, %f4, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ brnz,pt $rounds, .Lenc
+ add $key, 32, $key
+
+ andcc $out, 7, $tmp ! is output aligned?
+ aes_eround01 %f12, %f0, %f2, %f4
+ aes_eround23 %f14, %f0, %f2, %f2
+ aes_eround01_l %f16, %f4, %f2, %f0
+ aes_eround23_l %f18, %f4, %f2, %f2
+
+ bnz,pn %icc, 2f
+ nop
+
+ std %f0, [$out + 0]
+ retl
+ std %f2, [$out + 8]
+
+2: alignaddrl $out, %g0, $out
+ mov 0xff, $mask
+ srl $mask, $tmp, $mask
+
+ faligndata %f0, %f0, %f4
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $mask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $mask, $mask
+ retl
+ stda %f8, [$out + $mask]0xc0 ! partial store
+.type aes_t4_encrypt,#function
+.size aes_t4_encrypt,.-aes_t4_encrypt
+
+.globl aes_t4_decrypt
+.align 32
+aes_t4_decrypt:
+ andcc $inp, 7, %g1 ! is input aligned?
+ andn $inp, 7, $inp
+
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+
+ ldx [$inp + 0], %o4
+ bz,pt %icc, 1f
+ ldx [$inp + 8], %o5
+ ldx [$inp + 16], $inp
+ sll %g1, 3, %g1
+ sub %g0, %g1, %o3
+ sllx %o4, %g1, %o4
+ sllx %o5, %g1, %g1
+ srlx %o5, %o3, %o5
+ srlx $inp, %o3, %o3
+ or %o5, %o4, %o4
+ or %o3, %g1, %o5
+1:
+ ld [$key + 240], $rounds
+ ldd [$key + 16], %f12
+ ldd [$key + 24], %f14
+ xor %g4, %o4, %o4
+ xor %g5, %o5, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ srl $rounds, 1, $rounds
+ ldd [$key + 32], %f16
+ sub $rounds, 1, $rounds
+ ldd [$key + 40], %f18
+ add $key, 48, $key
+
+.Ldec:
+ aes_dround01 %f12, %f0, %f2, %f4
+ aes_dround23 %f14, %f0, %f2, %f2
+ ldd [$key + 0], %f12
+ ldd [$key + 8], %f14
+ sub $rounds,1,$rounds
+ aes_dround01 %f16, %f4, %f2, %f0
+ aes_dround23 %f18, %f4, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ brnz,pt $rounds, .Ldec
+ add $key, 32, $key
+
+ andcc $out, 7, $tmp ! is output aligned?
+ aes_dround01 %f12, %f0, %f2, %f4
+ aes_dround23 %f14, %f0, %f2, %f2
+ aes_dround01_l %f16, %f4, %f2, %f0
+ aes_dround23_l %f18, %f4, %f2, %f2
+
+ bnz,pn %icc, 2f
+ nop
+
+ std %f0, [$out + 0]
+ retl
+ std %f2, [$out + 8]
+
+2: alignaddrl $out, %g0, $out
+ mov 0xff, $mask
+ srl $mask, $tmp, $mask
+
+ faligndata %f0, %f0, %f4
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $mask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $mask, $mask
+ retl
+ stda %f8, [$out + $mask]0xc0 ! partial store
+.type aes_t4_decrypt,#function
+.size aes_t4_decrypt,.-aes_t4_decrypt
+___
+}
+
+######################################################################
+# key setup subroutines
+#
+{
+my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
+$code.=<<___;
+.globl aes_t4_set_encrypt_key
+.align 32
+aes_t4_set_encrypt_key:
+.Lset_encrypt_key:
+ and $inp, 7, $tmp
+ alignaddr $inp, %g0, $inp
+ cmp $bits, 192
+ ldd [$inp + 0], %f0
+ bl,pt %icc,.L128
+ ldd [$inp + 8], %f2
+
+ be,pt %icc,.L192
+ ldd [$inp + 16], %f4
+ brz,pt $tmp, .L256aligned
+ ldd [$inp + 24], %f6
+
+ ldd [$inp + 32], %f8
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+ faligndata %f6, %f8, %f6
+.L256aligned:
+___
+for ($i=0; $i<6; $i++) {
+ $code.=<<___;
+ std %f0, [$out + `32*$i+0`]
+ aes_kexpand1 %f0, %f6, $i, %f0
+ std %f2, [$out + `32*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `32*$i+16`]
+ aes_kexpand0 %f4, %f2, %f4
+ std %f6, [$out + `32*$i+24`]
+ aes_kexpand2 %f6, %f4, %f6
+___
+}
+$code.=<<___;
+ std %f0, [$out + `32*$i+0`]
+ aes_kexpand1 %f0, %f6, $i, %f0
+ std %f2, [$out + `32*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `32*$i+16`]
+ std %f6, [$out + `32*$i+24`]
+ std %f0, [$out + `32*$i+32`]
+ std %f2, [$out + `32*$i+40`]
+
+ mov 14, $tmp
+ st $tmp, [$out + 240]
+ retl
+ xor %o0, %o0, %o0
+
+.align 16
+.L192:
+ brz,pt $tmp, .L192aligned
+ nop
+
+ ldd [$inp + 24], %f6
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+.L192aligned:
+___
+for ($i=0; $i<7; $i++) {
+ $code.=<<___;
+ std %f0, [$out + `24*$i+0`]
+ aes_kexpand1 %f0, %f4, $i, %f0
+ std %f2, [$out + `24*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `24*$i+16`]
+ aes_kexpand2 %f4, %f2, %f4
+___
+}
+$code.=<<___;
+ std %f0, [$out + `24*$i+0`]
+ aes_kexpand1 %f0, %f4, $i, %f0
+ std %f2, [$out + `24*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+ std %f4, [$out + `24*$i+16`]
+ std %f0, [$out + `24*$i+24`]
+ std %f2, [$out + `24*$i+32`]
+
+ mov 12, $tmp
+ st $tmp, [$out + 240]
+ retl
+ xor %o0, %o0, %o0
+
+.align 16
+.L128:
+ brz,pt $tmp, .L128aligned
+ nop
+
+ ldd [$inp + 16], %f4
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+.L128aligned:
+___
+for ($i=0; $i<10; $i++) {
+ $code.=<<___;
+ std %f0, [$out + `16*$i+0`]
+ aes_kexpand1 %f0, %f2, $i, %f0
+ std %f2, [$out + `16*$i+8`]
+ aes_kexpand2 %f2, %f0, %f2
+___
+}
+$code.=<<___;
+ std %f0, [$out + `16*$i+0`]
+ std %f2, [$out + `16*$i+8`]
+
+ mov 10, $tmp
+ st $tmp, [$out + 240]
+ retl
+ xor %o0, %o0, %o0
+.type aes_t4_set_encrypt_key,#function
+.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
+
+.globl aes_t4_set_decrypt_key
+.align 32
+aes_t4_set_decrypt_key:
+ mov %o7, %o5
+ call .Lset_encrypt_key
+ nop
+
+ mov %o5, %o7
+ sll $tmp, 4, $inp ! $tmp is number of rounds
+ add $tmp, 2, $tmp
+ add $out, $inp, $inp ! $inp=$out+16*rounds
+ srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4
+
+.Lkey_flip:
+ ldd [$out + 0], %f0
+ ldd [$out + 8], %f2
+ ldd [$out + 16], %f4
+ ldd [$out + 24], %f6
+ ldd [$inp + 0], %f8
+ ldd [$inp + 8], %f10
+ ldd [$inp - 16], %f12
+ ldd [$inp - 8], %f14
+ sub $tmp, 1, $tmp
+ std %f0, [$inp + 0]
+ std %f2, [$inp + 8]
+ std %f4, [$inp - 16]
+ std %f6, [$inp - 8]
+ std %f8, [$out + 0]
+ std %f10, [$out + 8]
+ std %f12, [$out + 16]
+ std %f14, [$out + 24]
+ add $out, 32, $out
+ brnz $tmp, .Lkey_flip
+ sub $inp, 32, $inp
+
+ retl
+ xor %o0, %o0, %o0
+.type aes_t4_set_decrypt_key,#function
+.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
+___
+}
+
+{{{
+my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
+my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
+
+$code.=<<___;
+.align 32
+_aes128_loadkey:
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+___
+for ($i=2; $i<22;$i++) { # load key schedule
+ $code.=<<___;
+ ldd [$key + `8*$i`], %f`12+2*$i`
+___
+}
+$code.=<<___;
+ retl
+ nop
+.type _aes128_loadkey,#function
+.size _aes128_loadkey,.-_aes128_loadkey
+_aes128_load_enckey=_aes128_loadkey
+_aes128_load_deckey=_aes128_loadkey
+
+.align 32
+_aes128_encrypt_1x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_eround01 %f48, %f0, %f2, %f4
+ aes_eround23 %f50, %f0, %f2, %f2
+ aes_eround01_l %f52, %f4, %f2, %f0
+ retl
+ aes_eround23_l %f54, %f4, %f2, %f2
+.type _aes128_encrypt_1x,#function
+.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
+
+.align 32
+_aes128_encrypt_2x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_eround01 %f48, %f0, %f2, %f8
+ aes_eround23 %f50, %f0, %f2, %f2
+ aes_eround01 %f48, %f4, %f6, %f10
+ aes_eround23 %f50, %f4, %f6, %f6
+ aes_eround01_l %f52, %f8, %f2, %f0
+ aes_eround23_l %f54, %f8, %f2, %f2
+ aes_eround01_l %f52, %f10, %f6, %f4
+ retl
+ aes_eround23_l %f54, %f10, %f6, %f6
+.type _aes128_encrypt_2x,#function
+.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
+
+.align 32
+_aes128_decrypt_1x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_dround01 %f48, %f0, %f2, %f4
+ aes_dround23 %f50, %f0, %f2, %f2
+ aes_dround01_l %f52, %f4, %f2, %f0
+ retl
+ aes_dround23_l %f54, %f4, %f2, %f2
+.type _aes128_decrypt_1x,#function
+.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
+
+.align 32
+_aes128_decrypt_2x:
+___
+for ($i=0; $i<4; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_dround01 %f48, %f0, %f2, %f8
+ aes_dround23 %f50, %f0, %f2, %f2
+ aes_dround01 %f48, %f4, %f6, %f10
+ aes_dround23 %f50, %f4, %f6, %f6
+ aes_dround01_l %f52, %f8, %f2, %f0
+ aes_dround23_l %f54, %f8, %f2, %f2
+ aes_dround01_l %f52, %f10, %f6, %f4
+ retl
+ aes_dround23_l %f54, %f10, %f6, %f6
+.type _aes128_decrypt_2x,#function
+.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
+
+.align 32
+_aes192_loadkey:
+_aes256_loadkey:
+ ldx [$key + 0], %g4
+ ldx [$key + 8], %g5
+___
+for ($i=2; $i<26;$i++) { # load key schedule
+ $code.=<<___;
+ ldd [$key + `8*$i`], %f`12+2*$i`
+___
+}
+$code.=<<___;
+ retl
+ nop
+.type _aes192_loadkey,#function
+.size _aes192_loadkey,.-_aes192_loadkey
+_aes192_load_enckey=_aes192_loadkey
+_aes192_load_deckey=_aes192_loadkey
+_aes256_load_enckey=_aes192_loadkey
+_aes256_load_deckey=_aes192_loadkey
+
+.align 32
+_aes192_encrypt_1x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_eround01 %f56, %f0, %f2, %f4
+ aes_eround23 %f58, %f0, %f2, %f2
+ aes_eround01_l %f60, %f4, %f2, %f0
+ retl
+ aes_eround23_l %f62, %f4, %f2, %f2
+.type _aes192_encrypt_1x,#function
+.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
+
+.align 32
+_aes192_encrypt_2x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_eround01 %f56, %f0, %f2, %f8
+ aes_eround23 %f58, %f0, %f2, %f2
+ aes_eround01 %f56, %f4, %f6, %f10
+ aes_eround23 %f58, %f4, %f6, %f6
+ aes_eround01_l %f60, %f8, %f2, %f0
+ aes_eround23_l %f62, %f8, %f2, %f2
+ aes_eround01_l %f60, %f10, %f6, %f4
+ retl
+ aes_eround23_l %f62, %f10, %f6, %f6
+.type _aes192_encrypt_2x,#function
+.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
+
+.align 32
+_aes192_decrypt_1x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_dround01 %f56, %f0, %f2, %f4
+ aes_dround23 %f58, %f0, %f2, %f2
+ aes_dround01_l %f60, %f4, %f2, %f0
+ retl
+ aes_dround23_l %f62, %f4, %f2, %f2
+.type _aes192_decrypt_1x,#function
+.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
+
+.align 32
+_aes192_decrypt_2x:
+___
+for ($i=0; $i<5; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_dround01 %f56, %f0, %f2, %f8
+ aes_dround23 %f58, %f0, %f2, %f2
+ aes_dround01 %f56, %f4, %f6, %f10
+ aes_dround23 %f58, %f4, %f6, %f6
+ aes_dround01_l %f60, %f8, %f2, %f0
+ aes_dround23_l %f62, %f8, %f2, %f2
+ aes_dround01_l %f60, %f10, %f6, %f4
+ retl
+ aes_dround23_l %f62, %f10, %f6, %f6
+.type _aes192_decrypt_2x,#function
+.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
+
+.align 32
+_aes256_encrypt_1x:
+ aes_eround01 %f16, %f0, %f2, %f4
+ aes_eround23 %f18, %f0, %f2, %f2
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_eround01 %f20, %f4, %f2, %f0
+ aes_eround23 %f22, %f4, %f2, %f2
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_eround01 %f16, %f0, %f2, %f4
+ aes_eround23 %f18, %f0, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_eround01_l %f20, %f4, %f2, %f0
+ aes_eround23_l %f22, %f4, %f2, %f2
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_encrypt_1x,#function
+.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
+
+.align 32
+_aes256_encrypt_2x:
+ aes_eround01 %f16, %f0, %f2, %f8
+ aes_eround23 %f18, %f0, %f2, %f2
+ aes_eround01 %f16, %f4, %f6, %f10
+ aes_eround23 %f18, %f4, %f6, %f6
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_eround01 %f20, %f8, %f2, %f0
+ aes_eround23 %f22, %f8, %f2, %f2
+ aes_eround01 %f20, %f10, %f6, %f4
+ aes_eround23 %f22, %f10, %f6, %f6
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_eround01 %f16, %f0, %f2, %f8
+ aes_eround23 %f18, %f0, %f2, %f2
+ aes_eround01 %f16, %f4, %f6, %f10
+ aes_eround23 %f18, %f4, %f6, %f6
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_eround01_l %f20, %f8, %f2, %f0
+ aes_eround23_l %f22, %f8, %f2, %f2
+ aes_eround01_l %f20, %f10, %f6, %f4
+ aes_eround23_l %f22, %f10, %f6, %f6
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_encrypt_2x,#function
+.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
+
+.align 32
+_aes256_decrypt_1x:
+ aes_dround01 %f16, %f0, %f2, %f4
+ aes_dround23 %f18, %f0, %f2, %f2
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_dround01 %f20, %f4, %f2, %f0
+ aes_dround23 %f22, %f4, %f2, %f2
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
+___
+}
+$code.=<<___;
+ aes_dround01 %f16, %f0, %f2, %f4
+ aes_dround23 %f18, %f0, %f2, %f2
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_dround01_l %f20, %f4, %f2, %f0
+ aes_dround23_l %f22, %f4, %f2, %f2
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_decrypt_1x,#function
+.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
+
+.align 32
+_aes256_decrypt_2x:
+ aes_dround01 %f16, %f0, %f2, %f8
+ aes_dround23 %f18, %f0, %f2, %f2
+ aes_dround01 %f16, %f4, %f6, %f10
+ aes_dround23 %f18, %f4, %f6, %f6
+ ldd [$key + 208], %f16
+ ldd [$key + 216], %f18
+ aes_dround01 %f20, %f8, %f2, %f0
+ aes_dround23 %f22, %f8, %f2, %f2
+ aes_dround01 %f20, %f10, %f6, %f4
+ aes_dround23 %f22, %f10, %f6, %f6
+ ldd [$key + 224], %f20
+ ldd [$key + 232], %f22
+___
+for ($i=1; $i<6; $i++) {
+ $code.=<<___;
+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
+___
+}
+$code.=<<___;
+ aes_dround01 %f16, %f0, %f2, %f8
+ aes_dround23 %f18, %f0, %f2, %f2
+ aes_dround01 %f16, %f4, %f6, %f10
+ aes_dround23 %f18, %f4, %f6, %f6
+ ldd [$key + 16], %f16
+ ldd [$key + 24], %f18
+ aes_dround01_l %f20, %f8, %f2, %f0
+ aes_dround23_l %f22, %f8, %f2, %f2
+ aes_dround01_l %f20, %f10, %f6, %f4
+ aes_dround23_l %f22, %f10, %f6, %f6
+ ldd [$key + 32], %f20
+ retl
+ ldd [$key + 40], %f22
+.type _aes256_decrypt_2x,#function
+.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
+___
+
+&alg_cbc_encrypt_implement("aes",128);
+&alg_cbc_encrypt_implement("aes",192);
+&alg_cbc_encrypt_implement("aes",256);
+
+&alg_cbc_decrypt_implement("aes",128);
+&alg_cbc_decrypt_implement("aes",192);
+&alg_cbc_decrypt_implement("aes",256);
+
+if ($::evp) {
+ &alg_ctr32_implement("aes",128);
+ &alg_ctr32_implement("aes",192);
+ &alg_ctr32_implement("aes",256);
+}
+}}}
+
+if (!$::evp) {
+$code.=<<___;
+.global AES_encrypt
+AES_encrypt=aes_t4_encrypt
+.global AES_decrypt
+AES_decrypt=aes_t4_decrypt
+.global AES_set_encrypt_key
+.align 32
+AES_set_encrypt_key:
+ andcc %o2, 7, %g0 ! check alignment
+ bnz,a,pn %icc, 1f
+ mov -1, %o0
+ brz,a,pn %o0, 1f
+ mov -1, %o0
+ brz,a,pn %o2, 1f
+ mov -1, %o0
+ andncc %o1, 0x1c0, %g0
+ bnz,a,pn %icc, 1f
+ mov -2, %o0
+ cmp %o1, 128
+ bl,a,pn %icc, 1f
+ mov -2, %o0
+ b aes_t4_set_encrypt_key
+ nop
+1: retl
+ nop
+.type AES_set_encrypt_key,#function
+.size AES_set_encrypt_key,.-AES_set_encrypt_key
+
+.global AES_set_decrypt_key
+.align 32
+AES_set_decrypt_key:
+ andcc %o2, 7, %g0 ! check alignment
+ bnz,a,pn %icc, 1f
+ mov -1, %o0
+ brz,a,pn %o0, 1f
+ mov -1, %o0
+ brz,a,pn %o2, 1f
+ mov -1, %o0
+ andncc %o1, 0x1c0, %g0
+ bnz,a,pn %icc, 1f
+ mov -2, %o0
+ cmp %o1, 128
+ bl,a,pn %icc, 1f
+ mov -2, %o0
+ b aes_t4_set_decrypt_key
+ nop
+1: retl
+ nop
+.type AES_set_decrypt_key,#function
+.size AES_set_decrypt_key,.-AES_set_decrypt_key
+___
+
+my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
+
+$code.=<<___;
+.globl AES_cbc_encrypt
+.align 32
+AES_cbc_encrypt:
+ ld [$key + 240], %g1
+ nop
+ brz $enc, .Lcbc_decrypt
+ cmp %g1, 12
+
+ bl,pt %icc, aes128_t4_cbc_encrypt
+ nop
+ be,pn %icc, aes192_t4_cbc_encrypt
+ nop
+ ba aes256_t4_cbc_encrypt
+ nop
+
+.Lcbc_decrypt:
+ bl,pt %icc, aes128_t4_cbc_decrypt
+ nop
+ be,pn %icc, aes192_t4_cbc_decrypt
+ nop
+ ba aes256_t4_cbc_decrypt
+ nop
+.type AES_cbc_encrypt,#function
+.size AES_cbc_encrypt,.-AES_cbc_encrypt
+___
+}
+$code.=<<___;
+.asciz "AES for SPARC T4, David S. Miller, Andy Polyakov"
+.align 4
+___
+
+&emit_assembler();
+
+close STDOUT;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-1.0.1/inline-t4/sparcv9_modes.pl Wed Jul 17 15:19:38 2013 -0700
@@ -0,0 +1,1680 @@
+#!/usr/bin/env perl
+
+# Implementations of specific cipher modes for SPARC Architecture 2011.
+# There is a T4 dependency though, an ASI value that is not specified in
+# the Architecture Manual. But as the SPARC universe is rather
+# monocultural, we assume that any processor capable of executing the
+# crypto instructions can handle the ASI in question as well. This means
+# that we ought to keep our eyes open when new processors emerge...
+#
+# As for the above-mentioned ASI: it's the so-called "block initializing
+# store", which cancels the "read" in "read-update-write" on cache lines.
+# This is a "cooperative" optimization, as it reduces overall pressure
+# on the memory interface. The benefit can't be observed/quantified with
+# the usual benchmarks; on the contrary, you may notice that single-thread
+# performance for parallelizable modes is ~1.5% worse for the largest
+# block sizes [though a few percent better for not-so-long ones]. All
+# this is based on suggestions from David Miller.
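+#
+# For the record, the "T4-specific" ASI used by the bulk-store paths
+# below is 0xe2 (the stda instructions tagged ASI_BLK_INIT); the CBC
+# encrypt bulk loop, for example, issues membar #StoreLoad|#StoreStore
+# before handing the remaining tail back to the ordinary store loop.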
+
+sub asm_init { # to be called with @ARGV as argument
+ for (@_) { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
+ if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
+ else { $::bias=0; $::frame=112; $::size_t_cc="%icc"; }
+}
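+
+# A note on usage: aest4-sparcv9.pl pulls this file in with
+# require "sparcv9_modes.pl" and calls &asm_init(@ARGV), so passing
+# -m64 or -xarch=v9 to the generating script selects the 64-bit ABI
+# ($::bias=2047, $::frame=192, $::size_t_cc="%xcc"); any other flags
+# leave the 32-bit defaults in place.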
+
+# unified interface
+my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
+# local variables
+my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
+
+sub alg_cbc_encrypt_implement {
+my ($alg,$bits) = @_;
+
+$::code.=<<___;
+.globl ${alg}${bits}_t4_cbc_encrypt
+.align 32
+${alg}${bits}_t4_cbc_encrypt:
+ save %sp, -$::frame, %sp
+ sub $inp, $out, $blk_init ! $inp!=$out
+___
+$::code.=<<___ if (!$::evp);
+ andcc $ivec, 7, $ivoff
+ alignaddr $ivec, %g0, $ivec
+
+ ldd [$ivec + 0], %f0 ! load ivec
+ bz,pt %icc, 1f
+ ldd [$ivec + 8], %f2
+ ldd [$ivec + 16], %f4
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+1:
+___
+$::code.=<<___ if ($::evp);
+ ld [$ivec + 0], %f0
+ ld [$ivec + 4], %f1
+ ld [$ivec + 8], %f2
+ ld [$ivec + 12], %f3
+___
+$::code.=<<___;
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ call _${alg}${bits}_load_enckey
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 64, $iright
+ mov 0xff, $omask
+ sub $iright, $ileft, $iright
+ and $out, 7, $ooff
+ cmp $len, 127
+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
+ movleu $::size_t_cc, 0, $blk_init ! $len<128 ||
+ brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out)
+ srl $omask, $ooff, $omask
+
+ alignaddrl $out, %g0, $out
+ srlx $len, 4, $len
+ prefetch [$out], 22
+
+.L${bits}_cbc_enc_loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 4f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+4:
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ movxtod %o0, %f12
+ movxtod %o1, %f14
+
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ prefetch [$out + 63], 22
+ prefetch [$inp + 16+63], 20
+ call _${alg}${bits}_encrypt_1x
+ add $inp, 16, $inp
+
+ brnz,pn $ooff, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ brnz,pt $len, .L${bits}_cbc_enc_loop
+ add $out, 16, $out
+___
+$::code.=<<___ if ($::evp);
+ st %f0, [$ivec + 0]
+ st %f1, [$ivec + 4]
+ st %f2, [$ivec + 8]
+ st %f3, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, 3f
+ nop
+
+ std %f0, [$ivec + 0] ! write out ivec
+ std %f2, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f4 ! handle unaligned output
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $omask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $omask, $omask
+ stda %f8, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_cbc_enc_loop+4
+ orn %g0, $omask, $omask
+___
+$::code.=<<___ if ($::evp);
+ st %f0, [$ivec + 0]
+ st %f1, [$ivec + 4]
+ st %f2, [$ivec + 8]
+ st %f3, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, 3f
+ nop
+
+ std %f0, [$ivec + 0] ! write out ivec
+ std %f2, [$ivec + 8]
+ ret
+ restore
+
+.align 16
+3: alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
+ mov 0xff, $omask
+ srl $omask, $ivoff, $omask
+ faligndata %f0, %f0, %f4
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+ stda %f4, [$ivec + $omask]0xc0
+ std %f6, [$ivec + 8]
+ add $ivec, 16, $ivec
+ orn %g0, $omask, $omask
+ stda %f8, [$ivec + $omask]0xc0
+___
+$::code.=<<___;
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}cbc_enc_blk:
+ add $out, $len, $blk_init
+ and $blk_init, 63, $blk_init ! tail
+ sub $len, $blk_init, $len
+ add $blk_init, 15, $blk_init ! round up to 16n
+ srlx $len, 4, $len
+ srl $blk_init, 4, $blk_init
+
+.L${bits}_cbc_enc_blk_loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 5f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+5:
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ movxtod %o0, %f12
+ movxtod %o1, %f14
+
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ prefetch [$inp + 16+63], 20
+ call _${alg}${bits}_encrypt_1x
+ add $inp, 16, $inp
+ sub $len, 1, $len
+
+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ brnz,pt $len, .L${bits}_cbc_enc_blk_loop
+ add $out, 8, $out
+
+ membar #StoreLoad|#StoreStore
+ brnz,pt $blk_init, .L${bits}_cbc_enc_loop
+ mov $blk_init, $len
+___
+$::code.=<<___ if ($::evp);
+ st %f0, [$ivec + 0]
+ st %f1, [$ivec + 4]
+ st %f2, [$ivec + 8]
+ st %f3, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, 3b
+ nop
+
+ std %f0, [$ivec + 0] ! write out ivec
+ std %f2, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+.type ${alg}${bits}_t4_cbc_encrypt,#function
+.size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
+___
+}
+
+sub alg_cbc_decrypt_implement {
+my ($alg,$bits) = @_;
+
+$::code.=<<___;
+.globl ${alg}${bits}_t4_cbc_decrypt
+.align 32
+${alg}${bits}_t4_cbc_decrypt:
+ save %sp, -$::frame, %sp
+ sub $inp, $out, $blk_init ! $inp!=$out
+___
+$::code.=<<___ if (!$::evp);
+ andcc $ivec, 7, $ivoff
+ alignaddr $ivec, %g0, $ivec
+
+ ldd [$ivec + 0], %f12 ! load ivec
+ bz,pt %icc, 1f
+ ldd [$ivec + 8], %f14
+ ldd [$ivec + 16], %f0
+ faligndata %f12, %f14, %f12
+ faligndata %f14, %f0, %f14
+1:
+___
+$::code.=<<___ if ($::evp);
+ ld [$ivec + 0], %f12 ! load ivec
+ ld [$ivec + 4], %f13
+ ld [$ivec + 8], %f14
+ ld [$ivec + 12], %f15
+___
+$::code.=<<___;
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ call _${alg}${bits}_load_deckey
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 64, $iright
+ mov 0xff, $omask
+ sub $iright, $ileft, $iright
+ and $out, 7, $ooff
+ cmp $len, 255
+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
+ brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out)
+ srl $omask, $ooff, $omask
+
+ andcc $len, 16, %g0 ! is number of blocks even?
+ srlx $len, 4, $len
+ alignaddrl $out, %g0, $out
+ bz %icc, .L${bits}_cbc_dec_loop2x
+ prefetch [$out], 22
+.L${bits}_cbc_dec_loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 4f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+4:
+ xor %g4, %o0, %o2 ! ^= rk[0]
+ xor %g5, %o1, %o3
+ movxtod %o2, %f0
+ movxtod %o3, %f2
+
+ prefetch [$out + 63], 22
+ prefetch [$inp + 16+63], 20
+ call _${alg}${bits}_decrypt_1x
+ add $inp, 16, $inp
+
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ movxtod %o0, %f12
+ movxtod %o1, %f14
+
+ brnz,pn $ooff, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ brnz,pt $len, .L${bits}_cbc_dec_loop2x
+ add $out, 16, $out
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0]
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f4 ! handle unaligned output
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+
+ stda %f4, [$out + $omask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $omask, $omask
+ stda %f8, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
+ orn %g0, $omask, $omask
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0]
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_cbc_dec_loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 4f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+4:
+ xor %g4, %o0, %o4 ! ^= rk[0]
+ xor %g5, %o1, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ xor %g4, %o2, %o4
+ xor %g5, %o3, %o5
+ movxtod %o4, %f4
+ movxtod %o5, %f6
+
+ prefetch [$out + 63], 22
+ prefetch [$inp + 32+63], 20
+ call _${alg}${bits}_decrypt_2x
+ add $inp, 32, $inp
+
+ movxtod %o0, %f8
+ movxtod %o1, %f10
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ movxtod %o2, %f12
+ movxtod %o3, %f14
+ fxor %f8, %f4, %f4
+ fxor %f10, %f6, %f6
+
+ brnz,pn $ooff, 2f
+ sub $len, 2, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ std %f4, [$out + 16]
+ std %f6, [$out + 24]
+ brnz,pt $len, .L${bits}_cbc_dec_loop2x
+ add $out, 32, $out
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0]
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f8 ! handle unaligned output
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+ faligndata %f6, %f6, %f6
+ stda %f8, [$out + $omask]0xc0 ! partial store
+ std %f0, [$out + 8]
+ std %f2, [$out + 16]
+ std %f4, [$out + 24]
+ add $out, 32, $out
+ orn %g0, $omask, $omask
+ stda %f6, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
+ orn %g0, $omask, $omask
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0]
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+ ret
+ restore
+
+.align 16
+.L${bits}_cbc_dec_unaligned_ivec:
+ alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
+ mov 0xff, $omask
+ srl $omask, $ivoff, $omask
+ faligndata %f12, %f12, %f0
+ faligndata %f12, %f14, %f2
+ faligndata %f14, %f14, %f4
+ stda %f0, [$ivec + $omask]0xc0
+ std %f2, [$ivec + 8]
+ add $ivec, 16, $ivec
+ orn %g0, $omask, $omask
+ stda %f4, [$ivec + $omask]0xc0
+___
+$::code.=<<___;
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}cbc_dec_blk:
+ add $out, $len, $blk_init
+ and $blk_init, 63, $blk_init ! tail
+ sub $len, $blk_init, $len
+ add $blk_init, 15, $blk_init ! round up to 16n
+ srlx $len, 4, $len
+ srl $blk_init, 4, $blk_init
+ sub $len, 1, $len
+ add $blk_init, 1, $blk_init
+
+.L${bits}_cbc_dec_blk_loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 5f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+5:
+ xor %g4, %o0, %o4 ! ^= rk[0]
+ xor %g5, %o1, %o5
+ movxtod %o4, %f0
+ movxtod %o5, %f2
+ xor %g4, %o2, %o4
+ xor %g5, %o3, %o5
+ movxtod %o4, %f4
+ movxtod %o5, %f6
+
+ prefetch [$inp + 32+63], 20
+ call _${alg}${bits}_decrypt_2x
+ add $inp, 32, $inp
+ subcc $len, 2, $len
+
+ movxtod %o0, %f8
+ movxtod %o1, %f10
+ fxor %f12, %f0, %f0 ! ^= ivec
+ fxor %f14, %f2, %f2
+ movxtod %o2, %f12
+ movxtod %o3, %f14
+ fxor %f8, %f4, %f4
+ fxor %f10, %f6, %f6
+
+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
+ add $out, 8, $out
+
+ add $blk_init, $len, $len
+ andcc $len, 1, %g0 ! is number of blocks even?
+ membar #StoreLoad|#StoreStore
+ bnz,pt %icc, .L${bits}_cbc_dec_loop
+ srl $len, 0, $len
+ brnz,pn $len, .L${bits}_cbc_dec_loop2x
+ nop
+___
+$::code.=<<___ if ($::evp);
+ st %f12, [$ivec + 0] ! write out ivec
+ st %f13, [$ivec + 4]
+ st %f14, [$ivec + 8]
+ st %f15, [$ivec + 12]
+___
+$::code.=<<___ if (!$::evp);
+ brnz,pn $ivoff, 3b
+ nop
+
+ std %f12, [$ivec + 0] ! write out ivec
+ std %f14, [$ivec + 8]
+___
+$::code.=<<___;
+ ret
+ restore
+.type ${alg}${bits}_t4_cbc_decrypt,#function
+.size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
+___
+}
+
+sub alg_ctr32_implement {
+my ($alg,$bits) = @_;
+
+$::code.=<<___;
+.globl ${alg}${bits}_t4_ctr32_encrypt
+.align 32
+${alg}${bits}_t4_ctr32_encrypt:
+ save %sp, -$::frame, %sp
+
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ call _${alg}${bits}_load_enckey
+ sllx $len, 4, $len
+
+ ld [$ivec + 0], %l4 ! counter
+ ld [$ivec + 4], %l5
+ ld [$ivec + 8], %l6
+ ld [$ivec + 12], %l7
+
+ sllx %l4, 32, %o5
+ or %l5, %o5, %o5
+ sllx %l6, 32, %g1
+ xor %o5, %g4, %g4 ! ^= rk[0]
+ xor %g1, %g5, %g5
+ movxtod %g4, %f14 ! most significant 64 bits
+
+ sub $inp, $out, $blk_init ! $inp!=$out
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 64, $iright
+ mov 0xff, $omask
+ sub $iright, $ileft, $iright
+ and $out, 7, $ooff
+ cmp $len, 255
+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
+ brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out)
+ srl $omask, $ooff, $omask
+
+ andcc $len, 16, %g0 ! is number of blocks even?
+ alignaddrl $out, %g0, $out
+ bz %icc, .L${bits}_ctr32_loop2x
+ srlx $len, 4, $len
+.L${bits}_ctr32_loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 4f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+4:
+ xor %g5, %l7, %g1 ! ^= rk[0]
+ add %l7, 1, %l7
+ movxtod %g1, %f2
+ srl %l7, 0, %l7 ! clruw
+ prefetch [$out + 63], 22
+ prefetch [$inp + 16+63], 20
+___
+$::code.=<<___ if ($alg eq "aes");
+ aes_eround01 %f16, %f14, %f2, %f4
+ aes_eround23 %f18, %f14, %f2, %f2
+___
+$::code.=<<___ if ($alg eq "cmll");
+ camellia_f %f16, %f2, %f14, %f2
+ camellia_f %f18, %f14, %f2, %f0
+___
+$::code.=<<___;
+ call _${alg}${bits}_encrypt_1x+8
+ add $inp, 16, $inp
+
+ movxtod %o0, %f10
+ movxtod %o1, %f12
+ fxor %f10, %f0, %f0 ! ^= inp
+ fxor %f12, %f2, %f2
+
+ brnz,pn $ooff, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ brnz,pt $len, .L${bits}_ctr32_loop2x
+ add $out, 16, $out
+
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f4 ! handle unaligned output
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+ stda %f4, [$out + $omask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $omask, $omask
+ stda %f8, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_ctr32_loop2x+4
+ orn %g0, $omask, $omask
+
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_ctr32_loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 4f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+4:
+ xor %g5, %l7, %g1 ! ^= rk[0]
+ add %l7, 1, %l7
+ movxtod %g1, %f2
+ srl %l7, 0, %l7 ! clruw
+ xor %g5, %l7, %g1
+ add %l7, 1, %l7
+ movxtod %g1, %f6
+ srl %l7, 0, %l7 ! clruw
+ prefetch [$out + 63], 22
+ prefetch [$inp + 32+63], 20
+___
+$::code.=<<___ if ($alg eq "aes");
+ aes_eround01 %f16, %f14, %f2, %f8
+ aes_eround23 %f18, %f14, %f2, %f2
+ aes_eround01 %f16, %f14, %f6, %f10
+ aes_eround23 %f18, %f14, %f6, %f6
+___
+$::code.=<<___ if ($alg eq "cmll");
+ camellia_f %f16, %f2, %f14, %f2
+ camellia_f %f16, %f6, %f14, %f6
+ camellia_f %f18, %f14, %f2, %f0
+ camellia_f %f18, %f14, %f6, %f4
+___
+$::code.=<<___;
+ call _${alg}${bits}_encrypt_2x+16
+ add $inp, 32, $inp
+
+ movxtod %o0, %f8
+ movxtod %o1, %f10
+ movxtod %o2, %f12
+ fxor %f8, %f0, %f0 ! ^= inp
+ movxtod %o3, %f8
+ fxor %f10, %f2, %f2
+ fxor %f12, %f4, %f4
+ fxor %f8, %f6, %f6
+
+ brnz,pn $ooff, 2f
+ sub $len, 2, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ std %f4, [$out + 16]
+ std %f6, [$out + 24]
+ brnz,pt $len, .L${bits}_ctr32_loop2x
+ add $out, 32, $out
+
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f8 ! handle unaligned output
+ faligndata %f0, %f2, %f0
+ faligndata %f2, %f4, %f2
+ faligndata %f4, %f6, %f4
+ faligndata %f6, %f6, %f6
+
+ stda %f8, [$out + $omask]0xc0 ! partial store
+ std %f0, [$out + 8]
+ std %f2, [$out + 16]
+ std %f4, [$out + 24]
+ add $out, 32, $out
+ orn %g0, $omask, $omask
+ stda %f6, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_ctr32_loop2x+4
+ orn %g0, $omask, $omask
+
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_ctr32_blk:
+ add $out, $len, $blk_init
+ and $blk_init, 63, $blk_init ! tail
+ sub $len, $blk_init, $len
+ add $blk_init, 15, $blk_init ! round up to 16n
+ srlx $len, 4, $len
+ srl $blk_init, 4, $blk_init
+ sub $len, 1, $len
+ add $blk_init, 1, $blk_init
+
+.L${bits}_ctr32_blk_loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 5f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+5:
+ xor %g5, %l7, %g1 ! ^= rk[0]
+ add %l7, 1, %l7
+ movxtod %g1, %f2
+ srl %l7, 0, %l7 ! clruw
+ xor %g5, %l7, %g1
+ add %l7, 1, %l7
+ movxtod %g1, %f6
+ srl %l7, 0, %l7 ! clruw
+ prefetch [$inp + 32+63], 20
+___
+$::code.=<<___ if ($alg eq "aes");
+ aes_eround01 %f16, %f14, %f2, %f8
+ aes_eround23 %f18, %f14, %f2, %f2
+ aes_eround01 %f16, %f14, %f6, %f10
+ aes_eround23 %f18, %f14, %f6, %f6
+___
+$::code.=<<___ if ($alg eq "cmll");
+ camellia_f %f16, %f2, %f14, %f2
+ camellia_f %f16, %f6, %f14, %f6
+ camellia_f %f18, %f14, %f2, %f0
+ camellia_f %f18, %f14, %f6, %f4
+___
+$::code.=<<___;
+ call _${alg}${bits}_encrypt_2x+16
+ add $inp, 32, $inp
+ subcc $len, 2, $len
+
+ movxtod %o0, %f8
+ movxtod %o1, %f10
+ movxtod %o2, %f12
+ fxor %f8, %f0, %f0 ! ^= inp
+ movxtod %o3, %f8
+ fxor %f10, %f2, %f2
+ fxor %f12, %f4, %f4
+ fxor %f8, %f6, %f6
+
+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x
+ add $out, 8, $out
+
+ add $blk_init, $len, $len
+ andcc $len, 1, %g0 ! is number of blocks even?
+ membar #StoreLoad|#StoreStore
+ bnz,pt %icc, .L${bits}_ctr32_loop
+ srl $len, 0, $len
+ brnz,pn $len, .L${bits}_ctr32_loop2x
+ nop
+
+ ret
+ restore
+.type ${alg}${bits}_t4_ctr32_encrypt,#function
+.size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
+___
+}
+
+sub alg_xts_implement {
+my ($alg,$bits,$dir) = @_;
+my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
+my $rem=$ivec;
+
+$::code.=<<___;
+.globl ${alg}${bits}_t4_xts_${dir}crypt
+.align 32
+${alg}${bits}_t4_xts_${dir}crypt:
+ save %sp, -$::frame-16, %sp
+
+ mov $ivec, %o0
+ add %fp, $::bias-16, %o1
+ call ${alg}_t4_encrypt
+ mov $key2, %o2
+
+ add %fp, $::bias-16, %l7
+ ldxa [%l7]0x88, %g2
+ add %fp, $::bias-8, %l7
+ ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
+
+ sethi %hi(0x76543210), %l7
+ or %l7, %lo(0x76543210), %l7
+ bmask %l7, %g0, %g0 ! byte swap mask
+
+ prefetch [$inp], 20
+ prefetch [$inp + 63], 20
+ call _${alg}${bits}_load_${dir}ckey
+ and $len, 15, $rem
+ and $len, -16, $len
+___
+$code.=<<___ if ($dir eq "de");
+ mov 0, %l7
+ movrnz $rem, 16, %l7
+ sub $len, %l7, $len
+___
+$code.=<<___;
+
+ sub $inp, $out, $blk_init ! $inp!=$out
+ and $inp, 7, $ileft
+ andn $inp, 7, $inp
+ sll $ileft, 3, $ileft
+ mov 64, $iright
+ mov 0xff, $omask
+ sub $iright, $ileft, $iright
+ and $out, 7, $ooff
+ cmp $len, 255
+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
+ brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out)
+ srl $omask, $ooff, $omask
+
+ andcc $len, 16, %g0 ! is number of blocks even?
+___
+$code.=<<___ if ($dir eq "de");
+ brz,pn $len, .L${bits}_xts_${dir}steal
+___
+$code.=<<___;
+ alignaddrl $out, %g0, $out
+ bz %icc, .L${bits}_xts_${dir}loop2x
+ srlx $len, 4, $len
+.L${bits}_xts_${dir}loop:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 4f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+4:
+ movxtod %g2, %f12
+ movxtod %g3, %f14
+ bshuffle %f12, %f12, %f12
+ bshuffle %f14, %f14, %f14
+
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ movxtod %o0, %f0
+ movxtod %o1, %f2
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+
+ prefetch [$out + 63], 22
+ prefetch [$inp + 16+63], 20
+ call _${alg}${bits}_${dir}crypt_1x
+ add $inp, 16, $inp
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ brnz,pn $ooff, 2f
+ sub $len, 1, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ brnz,pt $len, .L${bits}_xts_${dir}loop2x
+ add $out, 16, $out
+
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f4 ! handle unaligned output
+ faligndata %f0, %f2, %f6
+ faligndata %f2, %f2, %f8
+ stda %f4, [$out + $omask]0xc0 ! partial store
+ std %f6, [$out + 8]
+ add $out, 16, $out
+ orn %g0, $omask, $omask
+ stda %f8, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
+ orn %g0, $omask, $omask
+
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_xts_${dir}loop2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 4f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+4:
+ movxtod %g2, %f12
+ movxtod %g3, %f14
+ bshuffle %f12, %f12, %f12
+ bshuffle %f14, %f14, %f14
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ movxtod %g2, %f8
+ movxtod %g3, %f10
+ bshuffle %f8, %f8, %f8
+ bshuffle %f10, %f10, %f10
+
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ xor %g4, %o2, %o2 ! ^= rk[0]
+ xor %g5, %o3, %o3
+ movxtod %o0, %f0
+ movxtod %o1, %f2
+ movxtod %o2, %f4
+ movxtod %o3, %f6
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+ fxor %f8, %f4, %f4 ! ^= tweak[0]
+ fxor %f10, %f6, %f6
+
+ prefetch [$out + 63], 22
+ prefetch [$inp + 32+63], 20
+ call _${alg}${bits}_${dir}crypt_2x
+ add $inp, 32, $inp
+
+ movxtod %g2, %f8
+ movxtod %g3, %f10
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ bshuffle %f8, %f8, %f8
+ bshuffle %f10, %f10, %f10
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+ fxor %f8, %f4, %f4
+ fxor %f10, %f6, %f6
+
+ brnz,pn $ooff, 2f
+ sub $len, 2, $len
+
+ std %f0, [$out + 0]
+ std %f2, [$out + 8]
+ std %f4, [$out + 16]
+ std %f6, [$out + 24]
+ brnz,pt $len, .L${bits}_xts_${dir}loop2x
+ add $out, 32, $out
+
+ fsrc2 %f4, %f0
+ fsrc2 %f6, %f2
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+
+.align 16
+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
+ ! and ~3x deterioration
+ ! in inp==out case
+ faligndata %f0, %f0, %f8 ! handle unaligned output
+ faligndata %f0, %f2, %f10
+ faligndata %f2, %f4, %f12
+ faligndata %f4, %f6, %f14
+ faligndata %f6, %f6, %f0
+
+ stda %f8, [$out + $omask]0xc0 ! partial store
+ std %f10, [$out + 8]
+ std %f12, [$out + 16]
+ std %f14, [$out + 24]
+ add $out, 32, $out
+ orn %g0, $omask, $omask
+ stda %f0, [$out + $omask]0xc0 ! partial store
+
+ brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
+ orn %g0, $omask, $omask
+
+ fsrc2 %f4, %f0
+ fsrc2 %f6, %f2
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+.align 32
+.L${bits}_xts_${dir}blk:
+ add $out, $len, $blk_init
+ and $blk_init, 63, $blk_init ! tail
+ sub $len, $blk_init, $len
+ add $blk_init, 15, $blk_init ! round up to 16n
+ srlx $len, 4, $len
+ srl $blk_init, 4, $blk_init
+ sub $len, 1, $len
+ add $blk_init, 1, $blk_init
+
+.L${bits}_xts_${dir}blk2x:
+ ldx [$inp + 0], %o0
+ ldx [$inp + 8], %o1
+ ldx [$inp + 16], %o2
+ brz,pt $ileft, 5f
+ ldx [$inp + 24], %o3
+
+ ldx [$inp + 32], %o4
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ or %g1, %o0, %o0
+ sllx %o1, $ileft, %o1
+ srlx %o2, $iright, %g1
+ or %g1, %o1, %o1
+ sllx %o2, $ileft, %o2
+ srlx %o3, $iright, %g1
+ or %g1, %o2, %o2
+ sllx %o3, $ileft, %o3
+ srlx %o4, $iright, %o4
+ or %o4, %o3, %o3
+5:
+ movxtod %g2, %f12
+ movxtod %g3, %f14
+ bshuffle %f12, %f12, %f12
+ bshuffle %f14, %f14, %f14
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ movxtod %g2, %f8
+ movxtod %g3, %f10
+ bshuffle %f8, %f8, %f8
+ bshuffle %f10, %f10, %f10
+
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ xor %g4, %o2, %o2 ! ^= rk[0]
+ xor %g5, %o3, %o3
+ movxtod %o0, %f0
+ movxtod %o1, %f2
+ movxtod %o2, %f4
+ movxtod %o3, %f6
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+ fxor %f8, %f4, %f4 ! ^= tweak[0]
+ fxor %f10, %f6, %f6
+
+ prefetch [$inp + 32+63], 20
+ call _${alg}${bits}_${dir}crypt_2x
+ add $inp, 32, $inp
+
+ movxtod %g2, %f8
+ movxtod %g3, %f10
+
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %g2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %g3
+ xor %l7, %g2, %g2
+
+ bshuffle %f8, %f8, %f8
+ bshuffle %f10, %f10, %f10
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+ fxor %f8, %f4, %f4
+ fxor %f10, %f6, %f6
+
+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ add $out, 8, $out
+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
+ bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x
+ add $out, 8, $out
+
+ add $blk_init, $len, $len
+ andcc $len, 1, %g0 ! is number of blocks even?
+ membar #StoreLoad|#StoreStore
+ bnz,pt %icc, .L${bits}_xts_${dir}loop
+ srl $len, 0, $len
+ brnz,pn $len, .L${bits}_xts_${dir}loop2x
+ nop
+
+ fsrc2 %f4, %f0
+ fsrc2 %f6, %f2
+ brnz,pn $rem, .L${bits}_xts_${dir}steal
+ nop
+
+ ret
+ restore
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+___
+$code.=<<___ if ($dir eq "en");
+.align 32
+.L${bits}_xts_${dir}steal:
+ std %f0, [%fp + $::bias-16] ! copy of output
+ std %f2, [%fp + $::bias-8]
+
+ srl $ileft, 3, $ileft
+ add %fp, $::bias-16, %l7
+ add $inp, $ileft, $inp ! original $inp+$len&-15
+ add $out, $ooff, $out ! original $out+$len&-15
+ mov 0, $ileft
+ nop ! align
+
+.L${bits}_xts_${dir}stealing:
+ ldub [$inp + $ileft], %o0
+ ldub [%l7 + $ileft], %o1
+ dec $rem
+ stb %o0, [%l7 + $ileft]
+ stb %o1, [$out + $ileft]
+ brnz $rem, .L${bits}_xts_${dir}stealing
+ inc $ileft
+
+ mov %l7, $inp
+ sub $out, 16, $out
+ mov 0, $ileft
+ sub $out, $ooff, $out
+ ba .L${bits}_xts_${dir}loop ! one more time
+ mov 1, $len ! $rem is 0
+___
+$code.=<<___ if ($dir eq "de");
+.align 32
+.L${bits}_xts_${dir}steal:
+ ldx [$inp + 0], %o0
+ brz,pt $ileft, 8f
+ ldx [$inp + 8], %o1
+
+ ldx [$inp + 16], %o2
+ sllx %o0, $ileft, %o0
+ srlx %o1, $iright, %g1
+ sllx %o1, $ileft, %o1
+ or %g1, %o0, %o0
+ srlx %o2, $iright, %o2
+ or %o2, %o1, %o1
+8:
+ srax %g3, 63, %l7 ! next tweak value
+ addcc %g2, %g2, %o2
+ and %l7, 0x87, %l7
+ addxc %g3, %g3, %o3
+ xor %l7, %o2, %o2
+
+ movxtod %o2, %f12
+ movxtod %o3, %f14
+ bshuffle %f12, %f12, %f12
+ bshuffle %f14, %f14, %f14
+
+ xor %g4, %o0, %o0 ! ^= rk[0]
+ xor %g5, %o1, %o1
+ movxtod %o0, %f0
+ movxtod %o1, %f2
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+
+ call _${alg}${bits}_${dir}crypt_1x
+ add $inp, 16, $inp
+
+ fxor %f12, %f0, %f0 ! ^= tweak[0]
+ fxor %f14, %f2, %f2
+
+ std %f0, [%fp + $::bias-16]
+ std %f2, [%fp + $::bias-8]
+
+ srl $ileft, 3, $ileft
+ add %fp, $::bias-16, %l7
+ add $inp, $ileft, $inp ! original $inp+$len&-15
+ add $out, $ooff, $out ! original $out+$len&-15
+ mov 0, $ileft
+ add $out, 16, $out
+ nop ! align
+
+.L${bits}_xts_${dir}stealing:
+ ldub [$inp + $ileft], %o0
+ ldub [%l7 + $ileft], %o1
+ dec $rem
+ stb %o0, [%l7 + $ileft]
+ stb %o1, [$out + $ileft]
+ brnz $rem, .L${bits}_xts_${dir}stealing
+ inc $ileft
+
+ mov %l7, $inp
+ sub $out, 16, $out
+ mov 0, $ileft
+ sub $out, $ooff, $out
+ ba .L${bits}_xts_${dir}loop ! one more time
+ mov 1, $len ! $rem is 0
+___
+$code.=<<___;
+ ret
+ restore
+.type ${alg}${bits}_t4_xts_${dir}crypt,#function
+.size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
+___
+}
+
+# The purpose of these subroutines is to encode VIS instructions explicitly,
+# so that one can compile the module without having to specify VIS
+# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# The idea is to keep open the option of producing a "universal" binary and
+# letting the programmer detect at run-time whether the current CPU is VIS capable.
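+#
+# For illustration only (the opcode word is derived from the opf table and
+# encoding formula in unvis() below, not taken from the generated output):
+# a VIS instruction such as
+#	fxor %f14, %f2, %f2
+# in the code above is rewritten by emit_assembler() into the raw word
+#	.word	0x85b38d82	! fxor %f14,%f2,%f2
+# so the module assembles even without VIS support in the assembler.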
+sub unvis {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %visopf = ( "faligndata" => 0x048,
+ "bshuffle" => 0x04c,
+ "fnot2" => 0x066,
+ "fxor" => 0x06c,
+ "fsrc2" => 0x078 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub unvis3 {
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
+my ($ref,$opf);
+my %visopf = ( "addxc" => 0x011,
+ "addxccc" => 0x013,
+ "umulxhi" => 0x016,
+ "alignaddr" => 0x018,
+ "bmask" => 0x019,
+ "alignaddrl" => 0x01a );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if ($opf=$visopf{$mnemonic}) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%([goli])([0-9])/);
+ $_=$bias{$1}+$2;
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub unaes_round { # 4-argument instructions
+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
+my ($ref,$opf);
+my %aesopf = ( "aes_eround01" => 0,
+ "aes_eround23" => 1,
+ "aes_dround01" => 2,
+ "aes_dround23" => 3,
+ "aes_eround01_l"=> 4,
+ "aes_eround23_l"=> 5,
+ "aes_dround01_l"=> 6,
+ "aes_dround23_l"=> 7,
+ "aes_kexpand1" => 8 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
+
+ if (defined($opf=$aesopf{$mnemonic})) {
+ $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub unaes_kexpand { # 3-argument instructions
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %aesopf = ( "aes_kexpand0" => 0x130,
+ "aes_kexpand2" => 0x131 );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if (defined($opf=$aesopf{$mnemonic})) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub uncamellia_f { # 4-argument instructions
+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
+my ($ref,$opf);
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
+
+ if (1) {
+ $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub uncamellia3 { # 3-argument instructions
+my ($mnemonic,$rs1,$rs2,$rd)=@_;
+my ($ref,$opf);
+my %cmllopf = ( "camellia_fl" => 0x13c,
+ "camellia_fli" => 0x13d );
+
+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
+
+ if (defined($opf=$cmllopf{$mnemonic})) {
+ foreach ($rs1,$rs2,$rd) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub unmovxtox { # 2-argument instructions
+my ($mnemonic,$rs,$rd)=@_;
+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
+my ($ref,$opf);
+my %movxopf = ( "movdtox" => 0x110,
+ "movstouw" => 0x111,
+ "movstosw" => 0x113,
+ "movxtod" => 0x118,
+ "movwtos" => 0x119 );
+
+ $ref = "$mnemonic\t$rs,$rd";
+
+ if (defined($opf=$movxopf{$mnemonic})) {
+ foreach ($rs,$rd) {
+ return $ref if (!/%([fgoli])([0-9]{1,2})/);
+ $_=$bias{$1}+$2;
+ if ($2>=32) {
+ return $ref if ($2&1);
+ # re-encode for upper double register addressing
+ $_=($2|$2>>5)&31;
+ }
+ }
+
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
+ $ref;
+ } else {
+ return $ref;
+ }
+}
+
+sub undes {
+my ($mnemonic)=shift;
+my @args=@_;
+my ($ref,$opf);
+my %desopf = ( "des_round" => 0b1001,
+ "des_ip" => 0b100110100,
+ "des_iip" => 0b100110101,
+ "des_kexpand" => 0b100110110 );
+
+ $ref = "$mnemonic\t".join(",",@_);
+
+ if (defined($opf=$desopf{$mnemonic})) { # 4-arg
+ if ($mnemonic eq "des_round") {
+ foreach (@args[0..3]) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
+ $ref;
+ } elsif ($mnemonic eq "des_kexpand") { # 3-arg
+ foreach (@args[0..2]) {
+ return $ref if (!/(%f)?([0-9]{1,2})/);
+ $_=$2;
+ if ($2>=32) {
+ return $ref if ($2&1);
+ # re-encode for upper double register addressing
+ $_=($2|$2>>5)&31;
+ }
+ }
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
+ $ref;
+ } else { # 2-arg
+ foreach (@args[0..1]) {
+ return $ref if (!/%f([0-9]{1,2})/);
+ $_=$1;
+ if ($1>=32) {
+ return $ref if ($1&1);
+ # re-encode for upper double register addressing
+ $_=($1|$1>>5)&31;
+ }
+ }
+ return sprintf ".word\t0x%08x !%s",
+ 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
+ $ref;
+ }
+ } else {
+ return $ref;
+ }
+}
+
+sub emit_assembler {
+ foreach (split("\n",$::code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
+
+ s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
+ &unaes_round($1,$2,$3,$4,$5)
+ /geo or
+ s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &unaes_kexpand($1,$2,$3,$4)
+ /geo or
+ s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
+ &uncamellia_f($1,$2,$3,$4,$5)
+ /geo or
+ s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &uncamellia3($1,$2,$3,$4)
+ /geo or
+ s/\b(des_\w+)\s+(?<rs1>%f[0-9]{1,2}),\s*(?<rs2>[%fx0-9]+)(,\s*(?<rs3>%f[0-9]{1,2})(,\s*(?<rs4>%f[0-9]{1,2}))?)?/
+ &undes($1,$+{rs1},$+{rs2},$+{rs3},$+{rs4})
+ /geo or
+ s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
+ &unmovxtox($1,$2,$3)
+ /geo or
+ s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
+ &unmovxtox($1,$2,$3)
+ /geo or
+ s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
+ &unvis($1,$2,$3,$4)
+ /geo or
+ s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
+ &unvis3($1,$2,$3,$4)
+ /geo;
+
+ print $_,"\n";
+ }
+}
+
+1;
--- a/components/openssl/openssl-1.0.1/patches/31_dtls_version.patch Wed Jul 17 00:17:02 2013 -0700
+++ b/components/openssl/openssl-1.0.1/patches/31_dtls_version.patch Wed Jul 17 15:19:38 2013 -0700
@@ -8,4 +8,4 @@
+ if (s->version >= TLS1_1_VERSION || s->version == DTLS1_BAD_VER)
{
/* These lengths are all public so we can test them in
- * non-constant time.
+ * non-constant time.
--- a/components/openssl/openssl-1.0.1/patches/openssl-1.0.1e-t4-engine.sparc-patch Wed Jul 17 00:17:02 2013 -0700
+++ b/components/openssl/openssl-1.0.1/patches/openssl-1.0.1e-t4-engine.sparc-patch Wed Jul 17 15:19:38 2013 -0700
@@ -11,10 +11,10 @@
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:";
my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
--my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
+-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
-my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
-+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o t4_des.o:aes_core.o aes_cbc.o aes-sparcv9.o t4_aes.o::md5-sparcv9.o t4_md5.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o t4_sha1.o t4_sha2.o:::::::ghash-sparcv9.o::void";
-+my $sparcv9_fips_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o:des_enc-sparc.o fcrypt_b.o t4_des.o:aes_core.o aes_cbc.o aes-sparcv9.o t4_aes.o::md5-sparcv9.o t4_md5.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o t4_sha1.o t4_sha2.o:::::::ghash-sparcv9.o::void";
++my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o t4_des.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o t4_md5.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o t4_sha1.o t4_sha2.o:::::::ghash-sparcv9.o::void";
++my $sparcv9_fips_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o:des_enc-sparc.o fcrypt_b.o t4_des.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o t4_md5.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o t4_sha1.o t4_sha2.o:::::::ghash-sparcv9.o::void";
+my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o t4_des.o:t4_aes.o::t4_md5.o:t4_sha1.o t4_sha2.o:::::::void";
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
@@ -49,9 +49,9 @@
GENERAL=Makefile
#TEST=aestest.c
TEST=
-@@ -69,6 +73,10 @@
- aes-sparcv9.s: asm/aes-sparcv9.pl
- $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@
+@@ -72,6 +76,10 @@
+ aest4-sparcv9.s: asm/aest4-sparcv9.pl
+ $(PERL) asm/aest4-sparcv9.pl $(CFLAGS) > $@
+t4_aes.o: asm/t4_aes.S
+ as $(ASFLAGSYF) -o $@ asm/t4_aes.S
--- a/components/openssl/openssl-1.0.1/patches/openssl-t4-inline.sparc-patch Wed Jul 17 00:17:02 2013 -0700
+++ b/components/openssl/openssl-1.0.1/patches/openssl-t4-inline.sparc-patch Wed Jul 17 15:19:38 2013 -0700
@@ -1,5 +1,5 @@
#
-# This file addds inline T4 instruction support to OpenSSL upstream code.
+# This file adds inline T4 instruction support to OpenSSL upstream code.
#
Index: Configure
===================================================================
@@ -11,7 +11,7 @@
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:";
my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
-+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
++my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
@@ -852,3 +852,664 @@
d=Time_F(STOP);
print_result(D_SHA1,j,count,d);
}
+Index: openssl/crypto/aes/Makefile
+===================================================================
+--- Makefile Thu May 2 13:42:37 2013
++++ Makefile.orig Thu May 2 13:41:51 2013
+@@ -69,6 +69,9 @@
+ aes-sparcv9.s: asm/aes-sparcv9.pl
+ $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@
+
++aest4-sparcv9.s: asm/aest4-sparcv9.pl
++ $(PERL) asm/aest4-sparcv9.pl $(CFLAGS) > $@
++
+ aes-ppc.s: asm/aes-ppc.pl
+ $(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) [email protected]
+
+Index: openssl/crypto/evp/e_aes.c
+===================================================================
+--- e_aes.c Mon Feb 11 07:26:04 2013
++++ e_aes.c.56 Thu May 2 14:26:35 2013
+@@ -56,13 +58,12 @@
+ #include <assert.h>
+ #include <openssl/aes.h>
+ #include "evp_locl.h"
+-#ifndef OPENSSL_FIPS
+ #include "modes_lcl.h"
+ #include <openssl/rand.h>
+
+ typedef struct
+ {
+- AES_KEY ks;
++ union { double align; AES_KEY ks; } ks;
+ block128_f block;
+ union {
+ cbc128_f cbc;
+@@ -72,7 +73,7 @@
+
+ typedef struct
+ {
+- AES_KEY ks; /* AES key schedule to use */
++ union { double align; AES_KEY ks; } ks; /* AES key schedule to use */
+ int key_set; /* Set if key initialised */
+ int iv_set; /* Set if an iv is set */
+ GCM128_CONTEXT gcm;
+@@ -86,7 +87,7 @@
+
+ typedef struct
+ {
+- AES_KEY ks1, ks2; /* AES key schedules to use */
++ union { double align; AES_KEY ks; } ks1, ks2; /* AES key schedules to use */
+ XTS128_CONTEXT xts;
+ void (*stream)(const unsigned char *in,
+ unsigned char *out, size_t length,
+@@ -96,7 +97,7 @@
+
+ typedef struct
+ {
+- AES_KEY ks; /* AES key schedule to use */
++ union { double align; AES_KEY ks; } ks; /* AES key schedule to use */
+ int key_set; /* Set if key initialised */
+ int iv_set; /* Set if an iv is set */
+ int tag_set; /* Set if tag is valid */
+@@ -160,7 +161,7 @@
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__INTEL__) )
+
+-extern unsigned int OPENSSL_ia32cap_P[2];
++extern unsigned int OPENSSL_ia32cap_P[];
+
+ #ifdef VPAES_ASM
+ #define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
+@@ -310,7 +311,7 @@
+ return 1;
+ if (key)
+ {
+- aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
++ aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+ (block128_f)aesni_encrypt);
+ gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
+@@ -355,19 +356,19 @@
+ /* key_len is two AES keys */
+ if (enc)
+ {
+- aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
++ aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f)aesni_encrypt;
+ xctx->stream = aesni_xts_encrypt;
+ }
+ else
+ {
+- aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
++ aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f)aesni_decrypt;
+ xctx->stream = aesni_xts_decrypt;
+ }
+
+ aesni_set_encrypt_key(key + ctx->key_len/2,
+- ctx->key_len * 4, &xctx->ks2);
++ ctx->key_len * 4, &xctx->ks2.ks);
+ xctx->xts.block2 = (block128_f)aesni_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+@@ -394,7 +395,7 @@
+ return 1;
+ if (key)
+ {
+- aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
++ aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)aesni_encrypt);
+ cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks :
+@@ -456,6 +457,379 @@
+ const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+ { return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+
++#elif defined(AES_ASM) && (defined(__sparc) || defined(__sparc__))
++
++#include "sparc_arch.h"
++
++extern unsigned int OPENSSL_sparcv9cap_P[];
++
++#define SPARC_AES_CAPABLE (OPENSSL_sparcv9cap_P[1] & CFR_AES)
++
++void aes_t4_set_encrypt_key (const unsigned char *key, int bits,
++ AES_KEY *ks);
++void aes_t4_set_decrypt_key (const unsigned char *key, int bits,
++ AES_KEY *ks);
++void aes_t4_encrypt (const unsigned char *in, unsigned char *out,
++ const AES_KEY *key);
++void aes_t4_decrypt (const unsigned char *in, unsigned char *out,
++ const AES_KEY *key);
++/*
++ * Key-length-specific subroutines were chosen for the following reason.
++ * Each SPARC T4 core can execute up to 8 threads which share the core's
++ * resources. Keeping as much key material as possible in registers
++ * minimizes references to the shared memory interface, as well as the
++ * instruction count in the inner loops [much needed on T4]. Routines
++ * that were not key-length specific would require conditional branches
++ * either in the inner loops or on the subroutines' entry. The former is
++ * hardly acceptable, while the latter grows the code to roughly the size
++ * of the separate key-length-specific subroutines, so why fight?
++ */
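++/*
++ * Illustrative (hypothetical) use of the per-key-length entry points
++ * declared below: once aes_t4_set_encrypt_key() has filled an AES_KEY,
++ * a whole CBC buffer is handled in a single call, e.g.
++ *	aes256_t4_cbc_encrypt(in, out, len, &ks, ivec);
++ * with len a multiple of 16 bytes and ivec updated in place; the names
++ * in, out, len, ks and ivec are placeholders, not code from this patch.
++ */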
++void aes128_t4_cbc_encrypt (const unsigned char *in, unsigned char *out,
++ size_t len, const AES_KEY *key,
++ unsigned char *ivec);
++void aes128_t4_cbc_decrypt (const unsigned char *in, unsigned char *out,
++ size_t len, const AES_KEY *key,
++ unsigned char *ivec);
++void aes192_t4_cbc_encrypt (const unsigned char *in, unsigned char *out,
++ size_t len, const AES_KEY *key,
++ unsigned char *ivec);
++void aes192_t4_cbc_decrypt (const unsigned char *in, unsigned char *out,
++ size_t len, const AES_KEY *key,
++ unsigned char *ivec);
++void aes256_t4_cbc_encrypt (const unsigned char *in, unsigned char *out,
++ size_t len, const AES_KEY *key,
++ unsigned char *ivec);
++void aes256_t4_cbc_decrypt (const unsigned char *in, unsigned char *out,
++ size_t len, const AES_KEY *key,
++ unsigned char *ivec);
++void aes128_t4_ctr32_encrypt (const unsigned char *in, unsigned char *out,
++ size_t blocks, const AES_KEY *key,
++ unsigned char *ivec);
++void aes192_t4_ctr32_encrypt (const unsigned char *in, unsigned char *out,
++ size_t blocks, const AES_KEY *key,
++ unsigned char *ivec);
++void aes256_t4_ctr32_encrypt (const unsigned char *in, unsigned char *out,
++ size_t blocks, const AES_KEY *key,
++ unsigned char *ivec);
++
++static int aes_t4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
++ const unsigned char *iv, int enc)
++ {
++ int ret, mode, bits;
++ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
++
++ mode = ctx->cipher->flags & EVP_CIPH_MODE;
++ bits = ctx->key_len*8;
++ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
++ && !enc)
++ {
++ ret = 0;
++ aes_t4_set_decrypt_key(key, bits, ctx->cipher_data);
++ dat->block = (block128_f)aes_t4_decrypt;
++ switch (bits) {
++ case 128:
++ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
++ (cbc128_f)aes128_t4_cbc_decrypt :
++ NULL;
++ break;
++ case 192:
++ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
++ (cbc128_f)aes192_t4_cbc_decrypt :
++ NULL;
++ break;
++ case 256:
++ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
++ (cbc128_f)aes256_t4_cbc_decrypt :
++ NULL;
++ break;
++ default:
++ ret = -1;
++ }
++ }
++ else {
++ ret = 0;
++ aes_t4_set_encrypt_key(key, bits, ctx->cipher_data);
++ dat->block = (block128_f)aes_t4_encrypt;
++ switch (bits) {
++ case 128:
++ if (mode==EVP_CIPH_CBC_MODE)
++ dat->stream.cbc = (cbc128_f)aes128_t4_cbc_encrypt;
++ else if (mode==EVP_CIPH_CTR_MODE)
++ dat->stream.ctr = (ctr128_f)aes128_t4_ctr32_encrypt;
++ else
++ dat->stream.cbc = NULL;
++ break;
++ case 192:
++ if (mode==EVP_CIPH_CBC_MODE)
++ dat->stream.cbc = (cbc128_f)aes192_t4_cbc_encrypt;
++ else if (mode==EVP_CIPH_CTR_MODE)
++ dat->stream.ctr = (ctr128_f)aes192_t4_ctr32_encrypt;
++ else
++ dat->stream.cbc = NULL;
++ break;
++ case 256:
++ if (mode==EVP_CIPH_CBC_MODE)
++ dat->stream.cbc = (cbc128_f)aes256_t4_cbc_encrypt;
++ else if (mode==EVP_CIPH_CTR_MODE)
++ dat->stream.ctr = (ctr128_f)aes256_t4_ctr32_encrypt;
++ else
++ dat->stream.cbc = NULL;
++ break;
++ default:
++ ret = -1;
++ }
++ }
++
++ if(ret < 0)
++ {
++ EVPerr(EVP_F_AES_T4_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED);
++ return 0;
++ }
++
++ return 1;
++ }
++
++#define aes_t4_cbc_cipher aes_cbc_cipher
++static int aes_t4_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
++ const unsigned char *in, size_t len);
++
++#define aes_t4_ecb_cipher aes_ecb_cipher
++static int aes_t4_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
++ const unsigned char *in, size_t len);
++
++#define aes_t4_ofb_cipher aes_ofb_cipher
++static int aes_t4_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
++ const unsigned char *in,size_t len);
++
++#define aes_t4_cfb_cipher aes_cfb_cipher
++static int aes_t4_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
++ const unsigned char *in,size_t len);
++
++#define aes_t4_cfb8_cipher aes_cfb8_cipher
++static int aes_t4_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
++ const unsigned char *in,size_t len);
++
++#define aes_t4_cfb1_cipher aes_cfb1_cipher
++static int aes_t4_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
++ const unsigned char *in,size_t len);
++
++#define aes_t4_ctr_cipher aes_ctr_cipher
++static int aes_t4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len);
++
++static int aes_t4_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
++ const unsigned char *iv, int enc)
++ {
++ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
++ if (!iv && !key)
++ return 1;
++ if (key)
++ {
++ int bits = ctx->key_len * 8;
++ aes_t4_set_encrypt_key(key, bits, &gctx->ks.ks);
++ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
++ (block128_f)aes_t4_encrypt);
++ switch (bits) {
++ case 128:
++ gctx->ctr = (ctr128_f)aes128_t4_ctr32_encrypt;
++ break;
++ case 192:
++ gctx->ctr = (ctr128_f)aes192_t4_ctr32_encrypt;
++ break;
++ case 256:
++ gctx->ctr = (ctr128_f)aes256_t4_ctr32_encrypt;
++ break;
++ default:
++ return 0;
++ }
++ /* If we have an IV we can set it directly, otherwise use
++ * the saved IV.
++ */
++ if (iv == NULL && gctx->iv_set)
++ iv = gctx->iv;
++ if (iv)
++ {
++ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
++ gctx->iv_set = 1;
++ }
++ gctx->key_set = 1;
++ }
++ else
++ {
++ /* If key set use IV, otherwise copy */
++ if (gctx->key_set)
++ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
++ else
++ memcpy(gctx->iv, iv, gctx->ivlen);
++ gctx->iv_set = 1;
++ gctx->iv_gen = 0;
++ }
++ return 1;
++ }
++
++#define aes_t4_gcm_cipher aes_gcm_cipher
++static int aes_t4_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len);
++
++static int aes_t4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
++ const unsigned char *iv, int enc)
++ {
++ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
++ if (!iv && !key)
++ return 1;
++
++ if (key)
++ {
++ int bits = ctx->key_len * 4;
++ /* key_len is two AES keys */
++ if (enc)
++ {
++ aes_t4_set_encrypt_key(key, bits, &xctx->ks1.ks);
++ xctx->xts.block1 = (block128_f)aes_t4_encrypt;
++#if 0 /* not yet */
++ switch (bits) {
++ case 128:
++ xctx->stream = aes128_t4_xts_encrypt;
++ break;
++ case 192:
++ xctx->stream = aes192_t4_xts_encrypt;
++ break;
++ case 256:
++ xctx->stream = aes256_t4_xts_encrypt;
++ break;
++ default:
++ return 0;
++ }
++#endif
++ }
++ else
++ {
++ aes_t4_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
++ xctx->xts.block1 = (block128_f)aes_t4_decrypt;
++#if 0 /* not yet */
++ switch (bits) {
++ case 128:
++ xctx->stream = aes128_t4_xts_decrypt;
++ break;
++ case 192:
++ xctx->stream = aes192_t4_xts_decrypt;
++ break;
++ case 256:
++ xctx->stream = aes256_t4_xts_decrypt;
++ break;
++ default:
++ return 0;
++ }
++#endif
++ }
++
++ aes_t4_set_encrypt_key(key + ctx->key_len/2,
++ ctx->key_len * 4, &xctx->ks2.ks);
++ xctx->xts.block2 = (block128_f)aes_t4_encrypt;
++
++ xctx->xts.key1 = &xctx->ks1;
++ }
++
++ if (iv)
++ {
++ xctx->xts.key2 = &xctx->ks2;
++ memcpy(ctx->iv, iv, 16);
++ }
++
++ return 1;
++ }
++
++#define aes_t4_xts_cipher aes_xts_cipher
++static int aes_t4_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len);
++
++static int aes_t4_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
++ const unsigned char *iv, int enc)
++ {
++ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
++ if (!iv && !key)
++ return 1;
++ if (key)
++ {
++ int bits = ctx->key_len * 8;
++ aes_t4_set_encrypt_key(key, bits, &cctx->ks.ks);
++ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
++ &cctx->ks, (block128_f)aes_t4_encrypt);
++#if 0 /* not yet */
++ switch (bits) {
++ case 128:
++ cctx->str = enc?(ccm128_f)aes128_t4_ccm64_encrypt :
++ (ccm128_f)aes128_t4_ccm64_decrypt;
++ break;
++ case 192:
++ cctx->str = enc?(ccm128_f)aes192_t4_ccm64_encrypt :
++ (ccm128_f)aes192_t4_ccm64_decrypt;
++ break;
++ case 256:
++ cctx->str = enc?(ccm128_f)aes256_t4_ccm64_encrypt :
++ (ccm128_f)aes256_t4_ccm64_decrypt;
++ break;
++ default:
++ return 0;
++ }
++#endif
++ cctx->key_set = 1;
++ }
++ if (iv)
++ {
++ memcpy(ctx->iv, iv, 15 - cctx->L);
++ cctx->iv_set = 1;
++ }
++ return 1;
++ }
++
++#define aes_t4_ccm_cipher aes_ccm_cipher
++static int aes_t4_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len);
++
++#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
++static const EVP_CIPHER aes_t4_##keylen##_##mode = { \
++ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
++ flags|EVP_CIPH_##MODE##_MODE, \
++ aes_t4_init_key, \
++ aes_t4_##mode##_cipher, \
++ NULL, \
++ sizeof(EVP_AES_KEY), \
++ NULL,NULL,NULL,NULL }; \
++static const EVP_CIPHER aes_##keylen##_##mode = { \
++ nid##_##keylen##_##nmode,blocksize, \
++ keylen/8,ivlen, \
++ flags|EVP_CIPH_##MODE##_MODE, \
++ aes_init_key, \
++ aes_##mode##_cipher, \
++ NULL, \
++ sizeof(EVP_AES_KEY), \
++ NULL,NULL,NULL,NULL }; \
++const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
++{ return SPARC_AES_CAPABLE?&aes_t4_##keylen##_##mode:&aes_##keylen##_##mode; }
++
++#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
++static const EVP_CIPHER aes_t4_##keylen##_##mode = { \
++ nid##_##keylen##_##mode,blocksize, \
++ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
++ flags|EVP_CIPH_##MODE##_MODE, \
++ aes_t4_##mode##_init_key, \
++ aes_t4_##mode##_cipher, \
++ aes_##mode##_cleanup, \
++ sizeof(EVP_AES_##MODE##_CTX), \
++ NULL,NULL,aes_##mode##_ctrl,NULL }; \
++static const EVP_CIPHER aes_##keylen##_##mode = { \
++ nid##_##keylen##_##mode,blocksize, \
++ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
++ flags|EVP_CIPH_##MODE##_MODE, \
++ aes_##mode##_init_key, \
++ aes_##mode##_cipher, \
++ aes_##mode##_cleanup, \
++ sizeof(EVP_AES_##MODE##_CTX), \
++ NULL,NULL,aes_##mode##_ctrl,NULL }; \
++const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
++{ return SPARC_AES_CAPABLE?&aes_t4_##keylen##_##mode:&aes_##keylen##_##mode; }
++
+ #else
+
+ #define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+@@ -505,7 +879,7 @@
+ #ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE)
+ {
+- ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
++ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
+ dat->block = (block128_f)AES_decrypt;
+ dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt;
+ }
+@@ -514,7 +888,7 @@
+ #ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+- ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
++ ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
+ dat->block = (block128_f)vpaes_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)vpaes_cbc_encrypt :
+@@ -523,7 +897,7 @@
+ else
+ #endif
+ {
+- ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
++ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks);
+ dat->block = (block128_f)AES_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)AES_cbc_encrypt :
+@@ -533,7 +907,7 @@
+ #ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE)
+ {
+- ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
++ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
+ dat->block = (block128_f)AES_encrypt;
+ dat->stream.ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
+ }
+@@ -542,7 +916,7 @@
+ #ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+- ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
++ ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
+ dat->block = (block128_f)vpaes_encrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)vpaes_cbc_encrypt :
+@@ -551,7 +925,7 @@
+ else
+ #endif
+ {
+- ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
++ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks);
+ dat->block = (block128_f)AES_encrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)AES_cbc_encrypt :
+@@ -825,7 +1199,7 @@
+ #ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE)
+ {
+- AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
++ AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks);
+ CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+ (block128_f)AES_encrypt);
+ gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
+@@ -836,7 +1210,7 @@
+ #ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+- vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
++ vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks);
+ CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+ (block128_f)vpaes_encrypt);
+ gctx->ctr = NULL;
+@@ -843,7 +1217,7 @@
+ break;
+ }
+ #endif
+- AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
++ AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt);
+ #ifdef AES_CTR_ASM
+ gctx->ctr = (ctr128_f)AES_ctr32_encrypt;
+@@ -1074,17 +1448,17 @@
+ {
+ if (enc)
+ {
+- vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
++ vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f)vpaes_encrypt;
+ }
+ else
+ {
+- vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
++ vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f)vpaes_decrypt;
+ }
+
+ vpaes_set_encrypt_key(key + ctx->key_len/2,
+- ctx->key_len * 4, &xctx->ks2);
++ ctx->key_len * 4, &xctx->ks2.ks);
+ xctx->xts.block2 = (block128_f)vpaes_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+@@ -1093,17 +1467,17 @@
+ #endif
+ if (enc)
+ {
+- AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
++ AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f)AES_encrypt;
+ }
+ else
+ {
+- AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
++ AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks);
+ xctx->xts.block1 = (block128_f)AES_decrypt;
+ }
+
+ AES_set_encrypt_key(key + ctx->key_len/2,
+- ctx->key_len * 4, &xctx->ks2);
++ ctx->key_len * 4, &xctx->ks2.ks);
+ xctx->xts.block2 = (block128_f)AES_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+@@ -1214,7 +1588,7 @@
+ #ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+- vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks);
++ vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks.ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)vpaes_encrypt);
+ cctx->str = NULL;
+@@ -1222,7 +1596,7 @@
+ break;
+ }
+ #endif
+- AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
++ AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)AES_encrypt);
+ cctx->str = NULL;
+@@ -1310,5 +1684,4 @@
+ BLOCK_CIPHER_custom(NID_aes,192,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+ BLOCK_CIPHER_custom(NID_aes,256,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+
+-#endif
+ #endif
+Index: openssl/crypto/evp/evp.h
+===================================================================
+--- evp.h Mon Feb 11 07:26:04 2013
++++ evp.h.new Thu May 2 14:31:55 2013
+@@ -1256,6 +1256,7 @@
+ #define EVP_F_AESNI_INIT_KEY 165
+ #define EVP_F_AESNI_XTS_CIPHER 176
+ #define EVP_F_AES_INIT_KEY 133
++#define EVP_F_AES_T4_INIT_KEY 178
+ #define EVP_F_AES_XTS 172
+ #define EVP_F_AES_XTS_CIPHER 175
+ #define EVP_F_ALG_MODULE_INIT 177
+Index: openssl/crypto/evp/evp_err.c
+===================================================================
+--- evp_err.c Mon Feb 11 07:26:04 2013
++++ evp_err.c.new Thu May 2 14:33:24 2013
+@@ -73,6 +73,7 @@
+ {ERR_FUNC(EVP_F_AESNI_INIT_KEY), "AESNI_INIT_KEY"},
+ {ERR_FUNC(EVP_F_AESNI_XTS_CIPHER), "AESNI_XTS_CIPHER"},
+ {ERR_FUNC(EVP_F_AES_INIT_KEY), "AES_INIT_KEY"},
++{ERR_FUNC(EVP_F_AES_T4_INIT_KEY), "AES_T4_INIT_KEY"},
+ {ERR_FUNC(EVP_F_AES_XTS), "AES_XTS"},
+ {ERR_FUNC(EVP_F_AES_XTS_CIPHER), "AES_XTS_CIPHER"},
+ {ERR_FUNC(EVP_F_ALG_MODULE_INIT), "ALG_MODULE_INIT"},