# HG changeset patch # User Misaki Miyashita # Date 1374099578 25200 # Node ID f7ee98f5749eb8998de97a04834dea898cfa64ae # Parent 70e041ba5b04b20b201a8b8f85b995b55a50b268 PSARC 2013/034 OpenSSL 1.0.1 15824598 SUNBT7206150 T4 AES should be embedded in the OpenSSL upstream src diff -r 70e041ba5b04 -r f7ee98f5749e components/openssl/openssl-1.0.1/Makefile --- a/components/openssl/openssl-1.0.1/Makefile Wed Jul 17 00:17:02 2013 -0700 +++ b/components/openssl/openssl-1.0.1/Makefile Wed Jul 17 15:19:38 2013 -0700 @@ -181,7 +181,9 @@ $(LN) -fs $(COMPONENT_DIR)/engines/t4/t4_sha?.S $(@D)/crypto/sha/asm; \ $(LN) -fs $(COMPONENT_DIR)/wanboot-openssl/wanboot-stubs.c $(@D)/crypto; \ $(LN) -fs $(COMPONENT_DIR)/inline-t4/sparc_arch.h $(@D)/crypto/; \ - $(LN) -fs $(COMPONENT_DIR)/inline-t4/md5-sparcv9.pl $(@D)/crypto/md5/asm; ) + $(LN) -fs $(COMPONENT_DIR)/inline-t4/md5-sparcv9.pl $(@D)/crypto/md5/asm; \ + $(LN) -fs $(COMPONENT_DIR)/inline-t4/aest4-sparcv9.pl $(@D)/crypto/aes/asm; \ + $(LN) -fs $(COMPONENT_DIR)/inline-t4/sparcv9_modes.pl $(@D)/crypto/perlasm; ) # OpenSSL for wanboot is built on sparc only. @@ -194,7 +196,8 @@ # Object files for wanboot-openssl.o have to be listed explicitly. WANBOOT_OBJS = \ crypto/aes/aes-sparcv9.o crypto/aes/aes_cbc.o crypto/aes/aes_core.o \ - crypto/aes/aes_misc.o crypto/aes/aes_wrap.o crypto/asn1/a_bitstr.o \ + crypto/aes/aes_misc.o crypto/aes/aes_wrap.o crypto/aes/aest4-sparcv9.o \ + crypto/asn1/a_bitstr.o \ crypto/asn1/a_bool.o crypto/asn1/a_bytes.o crypto/asn1/a_d2i_fp.o \ crypto/asn1/a_digest.o crypto/asn1/a_dup.o crypto/asn1/a_enum.o \ crypto/asn1/a_gentm.o crypto/asn1/a_i2d_fp.o crypto/asn1/a_int.o \ diff -r 70e041ba5b04 -r f7ee98f5749e components/openssl/openssl-1.0.1/engines/pkcs11/hw_pk11.c --- a/components/openssl/openssl-1.0.1/engines/pkcs11/hw_pk11.c Wed Jul 17 00:17:02 2013 -0700 +++ b/components/openssl/openssl-1.0.1/engines/pkcs11/hw_pk11.c Wed Jul 17 15:19:38 2013 -0700 @@ -331,8 +331,12 @@ #ifdef SOLARIS_HW_SLOT_SELECTION static int check_hw_mechanisms(void); static int nid_in_table(int nid, int *nid_table); -static int hw_aes_instruction_set_present(void); +#if defined(__amd64) || defined(__i386) +static int hw_x86_aes_instruction_set_present(void); +#endif #if defined(__sparc) +static int hw_yf_aes_instruction_set_present(void); +static int hw_fj_aes_instruction_set_present(void); static int hw_yf_digest_instruction_present(void); #endif #endif /* SOLARIS_HW_SLOT_SELECTION */ @@ -2650,6 +2654,30 @@ if (!cipher) return (pk11_usable_ciphers(nids)); +#ifdef __sparc + /* + * If T4 AES instructions are present, don't advertise + * the AES mechanisms for pkcs11 engine as AES operations + * should be accelerated by the inline T4 instructions + * in the OpenSSL upstream code. + */ + if (hw_yf_aes_instruction_set_present() == 1) { + switch (nid) { + case NID_aes_128_cbc: + case NID_aes_192_cbc: + case NID_aes_256_cbc: + case NID_aes_128_ecb: + case NID_aes_192_ecb: + case NID_aes_256_ecb: + case NID_aes_128_ctr: + case NID_aes_192_ctr: + case NID_aes_256_ctr: + *cipher = NULL; + return (0); + } + } +#endif + switch (nid) { case NID_des_ede3_cbc: @@ -3487,6 +3515,21 @@ for (i = 0; i < PK11_CIPHER_MAX; ++i) { +#ifdef __sparc + /* + * if T4 AES instruction is present, don't include AES mechanism + * in the supported symmetric cipher list. 
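+ * AES operations are instead accelerated by the inline T4 instructions
+ * in the OpenSSL upstream code.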
+ */ + if (hw_yf_aes_instruction_set_present() == 1) { + switch (ciphers[i].mech_type) { + case CKM_AES_CBC: + case CKM_AES_ECB: + case CKM_AES_CTR: + continue; + } + } +#endif + pk11_get_symmetric_cipher(pflist, current_slot, current_slot_n_cipher, local_cipher_nids, &ciphers[i]); } @@ -3738,19 +3781,14 @@ if (nid_table == NULL) return (1); +#if defined(__x86) /* - * If we have an AES instruction set on SPARC we route everything - * through the Crypto Framework (ie., through pkcs11_softtoken in this - * case). This is for T4 which has HW instructions for AES, DES, MD5, - * SHA1, SHA256, SHA512, MONTMUL, and MPMUL. - * * On Intel, if we have AES-NI instruction set we route AES to the * Crypto Framework. Intel CPUs do not have other instruction sets for * HW crypto acceleration so we check the HW NID table for any other * mechanism. */ -#if defined(__x86) - if (hw_aes_instruction_set_present() == 1) + if (hw_x86_aes_instruction_set_present() == 1) { switch (nid) { @@ -3760,21 +3798,24 @@ case NID_aes_128_cbc: case NID_aes_192_cbc: case NID_aes_256_cbc: - return (1); - } - /* - * These are variables, cannot be used as case expressions. - */ - if (nid == NID_aes_128_ctr || - nid == NID_aes_192_ctr || - nid == NID_aes_256_ctr) - { + case NID_aes_128_ctr: + case NID_aes_192_ctr: + case NID_aes_256_ctr: return (1); } } #elif defined(__sparc) - if (hw_aes_instruction_set_present() == 1) + /* + * If we have a T4 AES instruction set on SPARC, we won't process AES in + * the Crypto Framework so that the job can be process directly using + * the inline AES instruction. This is for T4 which has HW instructions + * for AES, DES, MD5, SHA1, SHA256, SHA512, MONTMUL, and MPMUL. + */ + if (hw_yf_aes_instruction_set_present() == 1) { + return (0); + } else if (hw_fj_aes_instruction_set_present() == 1) { return (1); + } #endif /* The table is never full, there is always at least one NID_undef. */ @@ -3790,29 +3831,54 @@ return (0); } +#if defined(__amd64) || defined(__i386) /* Do we have an AES instruction set? */ static int -hw_aes_instruction_set_present(void) +hw_x86_aes_instruction_set_present(void) { static int present = -1; if (present == -1) { uint_t ui = 0; - (void) getisax(&ui, 1); - -#if defined(__amd64) || defined(__i386) present = (ui & AV_386_AES) > 0; -#elif defined(__sparc) - present = (ui & (AV_SPARC_AES|AV_SPARC_FJAES)) > 0; -#endif } return (present); } +#endif #if defined(__sparc) + +static int +hw_yf_aes_instruction_set_present(void) + { + static int present = -1; + if (present == -1) + { + uint_t ui = 0; + (void) getisax(&ui, 1); + present = (ui & (AV_SPARC_AES)) > 0; + } + return (present); + } + +/* Do we have a Fujitsu AES instruction set? */ +static int +hw_fj_aes_instruction_set_present(void) + { + static int present = -1; + if (present == -1) + { + uint_t ui = 0; + (void) getisax(&ui, 1); + present = (ui & (AV_SPARC_AES)) > 0; + } + + return (present); + } + static int hw_yf_digest_instruction_present(void) { diff -r 70e041ba5b04 -r f7ee98f5749e components/openssl/openssl-1.0.1/engines/t4/eng_t4.c --- a/components/openssl/openssl-1.0.1/engines/t4/eng_t4.c Wed Jul 17 00:17:02 2013 -0700 +++ b/components/openssl/openssl-1.0.1/engines/t4/eng_t4.c Wed Jul 17 15:19:38 2013 -0700 @@ -58,72 +58,24 @@ */ /* - * This engine supports SPARC microprocessors that provide AES and other + * This engine supports SPARC microprocessors that provide DES and other * cipher and hash instructions, such as the T4 microprocessor. 
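+ * AES is no longer handled by this engine; the T4 AES support is now
+ * embedded in the upstream OpenSSL source (see inline-t4/aest4-sparcv9.pl).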
*/ #include -#if !defined(OPENSSL_NO_HW) && !defined(OPENSSL_NO_HW_AES_T4) && \ - !defined(OPENSSL_NO_AES) +#if !defined(OPENSSL_NO_HW) #include #include /* getisax() */ #include #include #include #include -#include #include -#include "eng_t4_aes_asm.h" #define T4_LIB_NAME "SPARC T4 engine" #include "eng_t4_err.c" -/* Copied from Solaris aes_impl.h */ -#ifndef MAX_AES_NR -#define MAX_AES_NR 14 /* Maximum number of rounds */ -#endif -#ifndef MAX_AES_NB -#define MAX_AES_NB 4 /* Number of columns comprising a state */ -#endif - -/* Index for the supported ciphers */ -typedef enum { - T4_AES_128_CBC, - T4_AES_192_CBC, - T4_AES_256_CBC, -#ifndef SOLARIS_NO_AES_CFB128 - T4_AES_128_CFB128, - T4_AES_192_CFB128, - T4_AES_256_CFB128, -#endif /* !SOLARIS_NO_AES_CFB128 */ - T4_AES_128_CTR, - T4_AES_192_CTR, - T4_AES_256_CTR, - T4_AES_128_ECB, - T4_AES_192_ECB, - T4_AES_256_ECB, - T4_CIPHER_MAX -} t4_cipher_id; - -/* T4 cipher context; must be 8-byte aligned (last field must be uint64_t) */ -typedef struct t4_cipher_ctx { - t4_cipher_id index; - uint64_t *iv; - uint64_t aligned_iv_buffer[2]; /* use if original IV unaligned */ - /* Encryption and decryption key schedule are the same: */ - uint64_t t4_ks[((MAX_AES_NR) + 1) * (MAX_AES_NB)]; -} t4_cipher_ctx_t; - -typedef struct t4_cipher { - t4_cipher_id id; - int nid; - int iv_len; - int min_key_len; - int max_key_len; - unsigned long flags; -} t4_cipher_t; - /* Constants used when creating the ENGINE */ static const char *ENGINE_T4_ID = "t4"; static const char *ENGINE_T4_NAME = "SPARC T4 engine support"; @@ -165,10 +117,7 @@ #ifndef DYNAMIC_ENGINE #pragma inline(t4_bind) #endif -static t4_cipher_id get_cipher_index_by_nid(int nid); -#pragma inline(get_cipher_index_by_nid) -static void t4_instructions_present(_Bool *aes_present, _Bool *des_present, - _Bool *montmul_present); +static void t4_instructions_present(_Bool *des_present, _Bool *montmul_present); #pragma inline(t4_instructions_present) /* RSA_METHOD structure used by ENGINE_set_RSA() */ @@ -183,12 +132,6 @@ /* Static variables */ /* This can't be const as NID*ctr is inserted when the engine is initialized */ static int t4_cipher_nids[] = { - NID_aes_128_cbc, NID_aes_192_cbc, NID_aes_256_cbc, -#ifndef SOLARIS_NO_AES_CFB128 - NID_aes_128_cfb128, NID_aes_192_cfb128, NID_aes_256_cfb128, -#endif - NID_aes_128_ctr, NID_aes_192_ctr, NID_aes_256_ctr, - NID_aes_128_ecb, NID_aes_192_ecb, NID_aes_256_ecb, #ifndef OPENSSL_NO_DES /* Must be at end of list (see t4_des_cipher_count in t4_bind() */ NID_des_cbc, NID_des_ede3_cbc, NID_des_ecb, NID_des_ede3_ecb, @@ -198,66 +141,6 @@ static int t4_cipher_count = (sizeof (t4_cipher_nids) / sizeof (t4_cipher_nids[0])); -/* - * Cipher Table for all supported symmetric ciphers. - * Must be in same order as t4_cipher_id. 
- */ -static t4_cipher_t t4_cipher_table[] = { - /* ID NID IV min- max-key flags */ - {T4_AES_128_CBC, NID_aes_128_cbc, 16, 16, 16, 0}, - {T4_AES_192_CBC, NID_aes_192_cbc, 16, 24, 24, 0}, - {T4_AES_256_CBC, NID_aes_256_cbc, 16, 32, 32, 0}, -#ifndef SOLARIS_NO_AES_CFB128 - {T4_AES_128_CFB128, NID_aes_128_cfb128, 16, 16, 16, - EVP_CIPH_NO_PADDING}, - {T4_AES_192_CFB128, NID_aes_192_cfb128, 16, 24, 24, - EVP_CIPH_NO_PADDING}, - {T4_AES_256_CFB128, NID_aes_256_cfb128, 16, 32, 32, - EVP_CIPH_NO_PADDING}, -#endif - {T4_AES_128_CTR, NID_aes_128_ctr, 16, 16, 16, - EVP_CIPH_NO_PADDING}, - {T4_AES_192_CTR, NID_aes_192_ctr, 16, 24, 24, - EVP_CIPH_NO_PADDING}, - {T4_AES_256_CTR, NID_aes_256_ctr, 16, 32, 32, - EVP_CIPH_NO_PADDING}, - {T4_AES_128_ECB, NID_aes_128_ecb, 0, 16, 16, 0}, - {T4_AES_192_ECB, NID_aes_192_ecb, 0, 24, 24, 0}, - {T4_AES_256_ECB, NID_aes_256_ecb, 0, 32, 32, 0}, -}; - - -/* Formal declaration for functions in EVP_CIPHER structure */ -static int t4_cipher_init_aes(EVP_CIPHER_CTX *ctx, const unsigned char *key, - const unsigned char *iv, int enc); - -static int t4_cipher_do_aes_128_cbc(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, size_t inl); -static int t4_cipher_do_aes_192_cbc(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, size_t inl); -static int t4_cipher_do_aes_256_cbc(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, size_t inl); -#ifndef SOLARIS_NO_AES_CFB128 -static int t4_cipher_do_aes_128_cfb128(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, size_t inl); -static int t4_cipher_do_aes_192_cfb128(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, size_t inl); -static int t4_cipher_do_aes_256_cfb128(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, size_t inl); -#endif -static int t4_cipher_do_aes_128_ctr(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, size_t inl); -static int t4_cipher_do_aes_192_ctr(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, size_t inl); -static int t4_cipher_do_aes_256_ctr(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, size_t inl); -static int t4_cipher_do_aes_128_ecb(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, size_t inl); -static int t4_cipher_do_aes_192_ecb(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, size_t inl); -static int t4_cipher_do_aes_256_ecb(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, size_t inl); - /* * Cipher Algorithms @@ -274,120 +157,6 @@ * set_asn1_parameters(), get_asn1_parameters(), ctrl(), app_data */ -static const EVP_CIPHER t4_aes_128_cbc = { - NID_aes_128_cbc, - 16, 16, 16, - EVP_CIPH_CBC_MODE, - t4_cipher_init_aes, t4_cipher_do_aes_128_cbc, NULL, - sizeof (t4_cipher_ctx_t), - EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, - NULL, NULL -}; -static const EVP_CIPHER t4_aes_192_cbc = { - NID_aes_192_cbc, - 16, 24, 16, - EVP_CIPH_CBC_MODE, - t4_cipher_init_aes, t4_cipher_do_aes_192_cbc, NULL, - sizeof (t4_cipher_ctx_t), - EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, - NULL, NULL -}; -static const EVP_CIPHER t4_aes_256_cbc = { - NID_aes_256_cbc, - 16, 32, 16, - EVP_CIPH_CBC_MODE, - t4_cipher_init_aes, t4_cipher_do_aes_256_cbc, NULL, - sizeof (t4_cipher_ctx_t), - EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, - NULL, NULL -}; - -#ifndef SOLARIS_NO_AES_CFB128 -static const EVP_CIPHER t4_aes_128_cfb128 = { - NID_aes_128_cfb128, - 16, 16, 16, - EVP_CIPH_CFB_MODE, - t4_cipher_init_aes, 
t4_cipher_do_aes_128_cfb128, NULL, - sizeof (t4_cipher_ctx_t), - EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, - NULL, NULL -}; -static const EVP_CIPHER t4_aes_192_cfb128 = { - NID_aes_192_cfb128, - 16, 24, 16, - EVP_CIPH_CFB_MODE, - t4_cipher_init_aes, t4_cipher_do_aes_192_cfb128, NULL, - sizeof (t4_cipher_ctx_t), - EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, - NULL, NULL -}; -static const EVP_CIPHER t4_aes_256_cfb128 = { - NID_aes_256_cfb128, - 16, 32, 16, - EVP_CIPH_CFB_MODE, - t4_cipher_init_aes, t4_cipher_do_aes_256_cfb128, NULL, - sizeof (t4_cipher_ctx_t), - EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, - NULL, NULL -}; -#endif /* !SOLARIS_NO_AES_CFB128 */ - -static EVP_CIPHER t4_aes_128_ctr = { - NID_aes_128_ctr, - 16, 16, 16, - EVP_CIPH_CTR_MODE, - t4_cipher_init_aes, t4_cipher_do_aes_128_ctr, NULL, - sizeof (t4_cipher_ctx_t), - EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, - NULL, NULL -}; -static EVP_CIPHER t4_aes_192_ctr = { - NID_aes_192_ctr, - 16, 24, 16, - EVP_CIPH_CTR_MODE, - t4_cipher_init_aes, t4_cipher_do_aes_192_ctr, NULL, - sizeof (t4_cipher_ctx_t), - EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, - NULL, NULL -}; -static EVP_CIPHER t4_aes_256_ctr = { - NID_aes_256_ctr, - 16, 32, 16, - EVP_CIPH_CTR_MODE, - t4_cipher_init_aes, t4_cipher_do_aes_256_ctr, NULL, - sizeof (t4_cipher_ctx_t), - EVP_CIPHER_set_asn1_iv, EVP_CIPHER_get_asn1_iv, - NULL, NULL -}; - -/* - * ECB modes don't use an Initial Vector, so that's why set_asn1_parameters, - * get_asn1_parameters, and cleanup fields are set to NULL. - */ -static const EVP_CIPHER t4_aes_128_ecb = { - NID_aes_128_ecb, - 16, 16, 0, - EVP_CIPH_ECB_MODE, - t4_cipher_init_aes, t4_cipher_do_aes_128_ecb, NULL, - sizeof (t4_cipher_ctx_t), - NULL, NULL, NULL, NULL -}; -static const EVP_CIPHER t4_aes_192_ecb = { - NID_aes_192_ecb, - 16, 24, 0, - EVP_CIPH_ECB_MODE, - t4_cipher_init_aes, t4_cipher_do_aes_192_ecb, NULL, - sizeof (t4_cipher_ctx_t), - NULL, NULL, NULL, NULL -}; -static const EVP_CIPHER t4_aes_256_ecb = { - NID_aes_256_ecb, - 16, 32, 0, - EVP_CIPH_ECB_MODE, - t4_cipher_init_aes, t4_cipher_do_aes_256_ecb, NULL, - sizeof (t4_cipher_ctx_t), - NULL, NULL, NULL, NULL -}; #ifndef OPENSSL_NO_DES extern const EVP_CIPHER t4_des_cbc; @@ -402,13 +171,12 @@ */ /* - * Set aes_present, des_present and montmul_present to B_FALSE or B_TRUE - * depending on whether the current SPARC processor supports AES, DES + * Set des_present and montmul_present to B_FALSE or B_TRUE + * depending on whether the current SPARC processor supports DES * and MONTMUL, respectively. 
*/ static void -t4_instructions_present(_Bool *aes_present, _Bool *des_present, - _Bool *montmul_present) +t4_instructions_present(_Bool *des_present, _Bool *montmul_present) { #ifdef OPENSSL_NO_DES #undef AV_SPARC_DES @@ -417,7 +185,6 @@ uint_t ui; (void) getisax(&ui, 1); - *aes_present = ((ui & AV_SPARC_AES) != 0); *des_present = ((ui & AV_SPARC_DES) != 0); *montmul_present = ((ui & AV_SPARC_MONT) != 0); } @@ -443,35 +210,6 @@ } switch (nid) { - case NID_aes_128_cbc: - *cipher = &t4_aes_128_cbc; - break; - case NID_aes_192_cbc: - *cipher = &t4_aes_192_cbc; - break; - case NID_aes_256_cbc: - *cipher = &t4_aes_256_cbc; - break; - case NID_aes_128_ecb: - *cipher = &t4_aes_128_ecb; - break; - case NID_aes_192_ecb: - *cipher = &t4_aes_192_ecb; - break; - case NID_aes_256_ecb: - *cipher = &t4_aes_256_ecb; - break; -#ifndef SOLARIS_NO_AES_CFB128 - case NID_aes_128_cfb128: - *cipher = &t4_aes_128_cfb128; - break; - case NID_aes_192_cfb128: - *cipher = &t4_aes_192_cfb128; - break; - case NID_aes_256_cfb128: - *cipher = &t4_aes_256_cfb128; - break; -#endif /* !SOLARIS_NO_AES_CFB128 */ #ifndef OPENSSL_NO_DES case NID_des_cbc: *cipher = &t4_des_cbc; @@ -486,15 +224,6 @@ *cipher = &t4_des3_ecb; break; #endif /* !OPENSSL_NO_DES */ - case NID_aes_128_ctr: - *cipher = &t4_aes_128_ctr; - break; - case NID_aes_192_ctr: - *cipher = &t4_aes_192_ctr; - break; - case NID_aes_256_ctr: - *cipher = &t4_aes_256_ctr; - break; default: /* cipher not supported */ *cipher = NULL; @@ -505,260 +234,6 @@ } -/* Called by t4_cipher_init_aes() */ -static t4_cipher_id -get_cipher_index_by_nid(int nid) -{ - t4_cipher_id i; - - for (i = (t4_cipher_id)0; i < T4_CIPHER_MAX; ++i) - if (t4_cipher_table[i].nid == nid) - return (i); - return (T4_CIPHER_MAX); -} - - -/* ARGSUSED2 */ -static int -t4_cipher_init_aes(EVP_CIPHER_CTX *ctx, const unsigned char *key, - const unsigned char *iv, int enc) -{ - t4_cipher_ctx_t *tctx = ctx->cipher_data; - uint64_t *t4_ks = tctx->t4_ks; - t4_cipher_t *t4_cipher; - t4_cipher_id index; - int key_len = ctx->key_len; - uint64_t aligned_key_buffer[4]; /* 16, 24, or 32 bytes long */ - uint64_t *aligned_key; - - if (key == NULL) { - T4err(T4_F_CIPHER_INIT_AES, T4_R_CIPHER_KEY); - return (0); - } - - /* Get the cipher entry index in t4_cipher_table from nid */ - index = get_cipher_index_by_nid(ctx->cipher->nid); - if (index >= T4_CIPHER_MAX) { - T4err(T4_F_CIPHER_INIT_AES, T4_R_CIPHER_NID); - return (0); /* Error */ - } - t4_cipher = &t4_cipher_table[index]; - - /* Check key size and iv size */ - if (ctx->cipher->iv_len < t4_cipher->iv_len) { - T4err(T4_F_CIPHER_INIT_AES, T4_R_IV_LEN_INCORRECT); - return (0); /* Error */ - } - if ((key_len < t4_cipher->min_key_len) || - (key_len > t4_cipher->max_key_len)) { - T4err(T4_F_CIPHER_INIT_AES, T4_R_KEY_LEN_INCORRECT); - return (0); /* Error */ - } - - /* Set cipher flags, if any */ - ctx->flags |= t4_cipher->flags; - - /* Align the key */ - if (((unsigned long)key & 0x7) == 0) /* already aligned */ - aligned_key = (uint64_t *)key; - else { /* key is not 8-byte aligned */ -#ifdef DEBUG_T4 - (void) fprintf(stderr, "T4: key is not 8 byte aligned\n"); -#endif - (void) memcpy(aligned_key_buffer, key, key_len); - aligned_key = aligned_key_buffer; - } - - - /* - * Expand the key schedule. - * Copy original key to start of t4_ks key schedule. Note that the - * encryption and decryption key schedule are the same for T4. 
- */ - switch (key_len) { - case 16: - t4_aes_expand128(&t4_ks[2], - (const uint32_t *)aligned_key); - t4_ks[0] = aligned_key[0]; - t4_ks[1] = aligned_key[1]; - break; - case 24: - t4_aes_expand192(&t4_ks[3], - (const uint32_t *)aligned_key); - t4_ks[0] = aligned_key[0]; - t4_ks[1] = aligned_key[1]; - t4_ks[2] = aligned_key[2]; - break; - case 32: - t4_aes_expand256(&t4_ks[4], - (const uint32_t *)aligned_key); - t4_ks[0] = aligned_key[0]; - t4_ks[1] = aligned_key[1]; - t4_ks[2] = aligned_key[2]; - t4_ks[3] = aligned_key[3]; - break; - default: - T4err(T4_F_CIPHER_INIT_AES, T4_R_CIPHER_KEY); - return (0); - } - - /* Save index to cipher */ - tctx->index = index; - - /* Align IV, if needed */ - if (t4_cipher->iv_len <= 0) { /* no IV (such as with ECB mode) */ - tctx->iv = NULL; - } else if (((unsigned long)ctx->iv & 0x7) == 0) { /* already aligned */ - tctx->iv = (uint64_t *)ctx->iv; - } else { - /* IV is not 8 byte aligned */ - (void) memcpy(tctx->aligned_iv_buffer, ctx->iv, - ctx->cipher->iv_len); - tctx->iv = tctx->aligned_iv_buffer; -#ifdef DEBUG_T4 - (void) fprintf(stderr, - "t4_cipher_init_aes: IV is not 8 byte aligned\n"); - (void) fprintf(stderr, - "t4_cipher_init_aes: ctx->cipher->iv_len =%d\n", - ctx->cipher->iv_len); - (void) fprintf(stderr, "t4_cipher_init_aes: after " - "re-alignment, tctx->iv = %p\n", (void *)tctx->iv); -#endif /* DEBUG_T4 */ - } - - return (1); -} - - -/* - * ENCRYPT_UPDATE or DECRYPT_UPDATE - */ -#define T4_CIPHER_DO_AES(t4_cipher_do_aes, t4_aes_load_keys_for_encrypt, \ - t4_aes_encrypt, t4_aes_load_keys_for_decrypt, t4_aes_decrypt, iv) \ -static int \ -t4_cipher_do_aes(EVP_CIPHER_CTX *ctx, unsigned char *out, \ - const unsigned char *in, size_t inl) \ -{ \ - t4_cipher_ctx_t *tctx = ctx->cipher_data; \ - uint64_t *t4_ks = tctx->t4_ks; \ - unsigned long outl = inl; \ - unsigned char *bufin_alloc = NULL, *bufout_alloc = NULL; \ - unsigned char *bufin, *bufout; \ - \ - /* "in" and "out" must be 8 byte aligned */ \ - if (((unsigned long)in & 0x7) == 0) { /* already aligned */ \ - bufin = (unsigned char *)in; \ - } else { /* "in" is not 8 byte aligned */ \ - if (((unsigned long)out & 0x7) == 0) { /* aligned */ \ - /* use output buffer for input */ \ - bufin = out; \ - } else { \ - bufin = bufin_alloc = OPENSSL_malloc(inl); \ - if (bufin_alloc == NULL) \ - return (0); /* error */ \ - } \ - (void) memcpy(bufin, in, inl); \ - } \ - \ - if (((unsigned long)out & 0x7) == 0) { /* already aligned */ \ - bufout = out; \ - } else { /* "out" is not 8 byte aligned */ \ - if (bufin_alloc != NULL) { \ - /* use allocated input buffer for output */ \ - bufout = bufin_alloc; \ - } else { \ - bufout = bufout_alloc = OPENSSL_malloc(outl); \ - if (bufout_alloc == NULL) { \ - OPENSSL_free(bufin_alloc); \ - return (0); /* error */ \ - } \ - } \ - } \ - \ - /* Data length must be an even multiple of block size. 
*/ \ - if ((inl & 0xf) != 0) { \ - OPENSSL_free(bufout_alloc); \ - OPENSSL_free(bufin_alloc); \ - T4err(T4_F_CIPHER_DO_AES, T4_R_NOT_BLOCKSIZE_LENGTH); \ - return (0); \ - } \ - \ - if (ctx->encrypt) { \ - t4_aes_load_keys_for_encrypt(t4_ks); \ - t4_aes_encrypt(t4_ks, (uint64_t *)bufin, \ - (uint64_t *)bufout, (size_t)inl, iv); \ - } else { /* decrypt */ \ - t4_aes_load_keys_for_decrypt(t4_ks); \ - t4_aes_decrypt(t4_ks, (uint64_t *)bufin, \ - (uint64_t *)bufout, (size_t)inl, iv); \ - } \ - \ - /* Cleanup */ \ - if (bufin_alloc != NULL) { \ - if (bufout == bufin_alloc) \ - (void) memcpy(out, bufout, outl); \ - OPENSSL_free(bufin_alloc); \ - } \ - if (bufout_alloc != NULL) { \ - (void) memcpy(out, bufout_alloc, outl); \ - OPENSSL_free(bufout_alloc); \ - } \ - \ - return (1); \ -} - - -/* AES CBC mode. */ -T4_CIPHER_DO_AES(t4_cipher_do_aes_128_cbc, - t4_aes128_load_keys_for_encrypt, t4_aes128_cbc_encrypt, - t4_aes128_load_keys_for_decrypt, t4_aes128_cbc_decrypt, tctx->iv) -T4_CIPHER_DO_AES(t4_cipher_do_aes_192_cbc, - t4_aes192_load_keys_for_encrypt, t4_aes192_cbc_encrypt, - t4_aes192_load_keys_for_decrypt, t4_aes192_cbc_decrypt, tctx->iv) -T4_CIPHER_DO_AES(t4_cipher_do_aes_256_cbc, - t4_aes256_load_keys_for_encrypt, t4_aes256_cbc_encrypt, - t4_aes256_load_keys_for_decrypt, t4_aes256_cbc_decrypt, tctx->iv) - -/* - * AES CFB128 mode. - * CFB128 decrypt uses load_keys_for_encrypt() as the mode uses - * the raw AES encrypt operation for the decryption, too. - */ -#ifndef SOLARIS_NO_AES_CFB128 -T4_CIPHER_DO_AES(t4_cipher_do_aes_128_cfb128, - t4_aes128_load_keys_for_encrypt, t4_aes128_cfb128_encrypt, - t4_aes128_load_keys_for_encrypt, t4_aes128_cfb128_decrypt, tctx->iv) -T4_CIPHER_DO_AES(t4_cipher_do_aes_192_cfb128, - t4_aes192_load_keys_for_encrypt, t4_aes192_cfb128_encrypt, - t4_aes192_load_keys_for_encrypt, t4_aes192_cfb128_decrypt, tctx->iv) -T4_CIPHER_DO_AES(t4_cipher_do_aes_256_cfb128, - t4_aes256_load_keys_for_encrypt, t4_aes256_cfb128_encrypt, - t4_aes256_load_keys_for_encrypt, t4_aes256_cfb128_decrypt, tctx->iv) -#endif /* !SOLARIS_NO_AES_CFB128 */ - -/* AES CTR mode. */ -T4_CIPHER_DO_AES(t4_cipher_do_aes_128_ctr, - t4_aes128_load_keys_for_encrypt, t4_aes128_ctr_crypt, - t4_aes128_load_keys_for_decrypt, t4_aes128_ctr_crypt, tctx->iv) -T4_CIPHER_DO_AES(t4_cipher_do_aes_192_ctr, - t4_aes192_load_keys_for_encrypt, t4_aes192_ctr_crypt, - t4_aes192_load_keys_for_decrypt, t4_aes192_ctr_crypt, tctx->iv) -T4_CIPHER_DO_AES(t4_cipher_do_aes_256_ctr, - t4_aes256_load_keys_for_encrypt, t4_aes256_ctr_crypt, - t4_aes256_load_keys_for_decrypt, t4_aes256_ctr_crypt, tctx->iv) - -/* AES ECB mode. */ -T4_CIPHER_DO_AES(t4_cipher_do_aes_128_ecb, - t4_aes128_load_keys_for_encrypt, t4_aes128_ecb_encrypt, - t4_aes128_load_keys_for_decrypt, t4_aes128_ecb_decrypt, NULL) -T4_CIPHER_DO_AES(t4_cipher_do_aes_192_ecb, - t4_aes192_load_keys_for_encrypt, t4_aes192_ecb_encrypt, - t4_aes192_load_keys_for_decrypt, t4_aes192_ecb_decrypt, NULL) -T4_CIPHER_DO_AES(t4_cipher_do_aes_256_ecb, - t4_aes256_load_keys_for_encrypt, t4_aes256_ecb_encrypt, - t4_aes256_load_keys_for_decrypt, t4_aes256_ecb_decrypt, NULL) - - /* * Is the t4 engine available? * Passed to ENGINE_set_init_function(). 
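Both engines key off the same hardware-capability probe: a getisax(2) query whose AV_SPARC_* bits decide which mechanisms stay registered. A minimal, self-contained sketch of that check (helper name and headers are illustrative, not part of this patch):

	#include <sys/types.h>
	#include <sys/auxv.h>		/* getisax(), AV_SPARC_AES */

	/* Return 1 when the running CPU exposes the T4 AES instructions. */
	static int
	have_t4_aes(void)
	{
		uint_t ui = 0;

		(void) getisax(&ui, 1);
		return ((ui & AV_SPARC_AES) != 0);
	}

hw_yf_aes_instruction_set_present() in hw_pk11.c is essentially this test with the result cached in a static variable.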
@@ -789,12 +264,12 @@ static int t4_bind(ENGINE *e) { - _Bool aes_engage, des_engage, montmul_engage; + _Bool des_engage, montmul_engage; - t4_instructions_present(&aes_engage, &des_engage, &montmul_engage); + t4_instructions_present(&des_engage, &montmul_engage); #ifdef DEBUG_T4 (void) fprintf(stderr, - "t4_bind: engage aes=%d, des=%d\n", aes_engage, des_engage); + "t4_bind: engage des=%d\n", des_engage); #endif #ifndef OPENSSL_NO_DES if (!des_engage) { /* Remove DES ciphers from list */ @@ -814,9 +289,9 @@ /* Register T4 engine ID, name, and functions */ if (!ENGINE_set_id(e, ENGINE_T4_ID) || !ENGINE_set_name(e, - aes_engage ? ENGINE_T4_NAME: ENGINE_NO_T4_NAME) || + des_engage ? ENGINE_T4_NAME: ENGINE_NO_T4_NAME) || !ENGINE_set_init_function(e, t4_init) || - (aes_engage && !ENGINE_set_ciphers(e, t4_get_all_ciphers)) || + (des_engage && !ENGINE_set_ciphers(e, t4_get_all_ciphers)) || #ifndef OPENSSL_NO_RSA (montmul_engage && !ENGINE_set_RSA(e, t4_RSA())) || #endif /* OPENSSL_NO_RSA */ @@ -860,4 +335,4 @@ IMPLEMENT_DYNAMIC_BIND_FN(t4_bind_helper) #endif /* DYNAMIC_ENGINE */ #endif /* COMPILE_HW_T4 */ -#endif /* !OPENSSL_NO_HW && !OPENSSL_NO_HW_AES_T4 && !OPENSSL_NO_AES */ +#endif /* !OPENSSL_NO_HW */ diff -r 70e041ba5b04 -r f7ee98f5749e components/openssl/openssl-1.0.1/inline-t4/aest4-sparcv9.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/components/openssl/openssl-1.0.1/inline-t4/aest4-sparcv9.pl Wed Jul 17 15:19:38 2013 -0700 @@ -0,0 +1,902 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by David S. Miller and Andy Polyakov +# . The module is licensed under 2-clause BSD +# license. October 2012. All rights reserved. +# ==================================================================== + +###################################################################### +# AES for SPARC T4. +# +# AES round instructions complete in 3 cycles and can be issued every +# cycle. It means that round calculations should take 4*rounds cycles, +# because any given round instruction depends on result of *both* +# previous instructions: +# +# |0 |1 |2 |3 |4 +# |01|01|01| +# |23|23|23| +# |01|01|... +# |23|... +# +# Provided that fxor [with IV] takes 3 cycles to complete, critical +# path length for CBC encrypt would be 3+4*rounds, or in other words +# it should process one byte in at least (3+4*rounds)/16 cycles. This +# estimate doesn't account for "collateral" instructions, such as +# fetching input from memory, xor-ing it with zero-round key and +# storing the result. Yet, *measured* performance [for data aligned +# at 64-bit boundary!] deviates from this equation by less than 0.5%: +# +# 128-bit key 192- 256- +# CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90 +# (*) numbers after slash are for +# misaligned data. +# +# Out-of-order execution logic managed to fully overlap "collateral" +# instructions with those on critical path. Amazing! +# +# As with Intel AES-NI, question is if it's possible to improve +# performance of parallelizeable modes by interleaving round +# instructions. Provided round instruction latency and throughput +# optimal interleave factor is 2. But can we expect 2x performance +# improvement? Well, as round instructions can be issued one per +# cycle, they don't saturate the 2-way issue pipeline and therefore +# there is room for "collateral" calculations... 
Yet, 2x speed-up +# over CBC encrypt remains unattaintable: +# +# 128-bit key 192- 256- +# CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61 +# CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61 +# (*) numbers after slash are for +# misaligned data. +# +# Estimates based on amount of instructions under assumption that +# round instructions are not pairable with any other instruction +# suggest that latter is the actual case and pipeline runs +# underutilized. It should be noted that T4 out-of-order execution +# logic is so capable that performance gain from 2x interleave is +# not even impressive, ~7-13% over non-interleaved code, largest +# for 256-bit keys. + +# To anchor to something else, software implementation processes +# one byte in 29 cycles with 128-bit key on same processor. Intel +# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts +# in 0.93, naturally with AES-NI. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "sparcv9_modes.pl"; + +&asm_init(@ARGV); + +$::evp=1; # if $evp is set to 0, script generates module with +# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry +# points. These however are not fully compatible with openssl/aes.h, +# because they expect AES_KEY to be aligned at 64-bit boundary. When +# used through EVP, alignment is arranged at EVP layer. Second thing +# that is arranged by EVP is at least 32-bit alignment of IV. + +###################################################################### +# single-round subroutines +# +{ +my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5)); + +$code=<<___; +.text + +.globl aes_t4_encrypt +.align 32 +aes_t4_encrypt: + andcc $inp, 7, %g1 ! is input aligned? + andn $inp, 7, $inp + + ldx [$key + 0], %g4 + ldx [$key + 8], %g5 + + ldx [$inp + 0], %o4 + bz,pt %icc, 1f + ldx [$inp + 8], %o5 + ldx [$inp + 16], $inp + sll %g1, 3, %g1 + sub %g0, %g1, %o3 + sllx %o4, %g1, %o4 + sllx %o5, %g1, %g1 + srlx %o5, %o3, %o5 + srlx $inp, %o3, %o3 + or %o5, %o4, %o4 + or %o3, %g1, %o5 +1: + ld [$key + 240], $rounds + ldd [$key + 16], %f12 + ldd [$key + 24], %f14 + xor %g4, %o4, %o4 + xor %g5, %o5, %o5 + movxtod %o4, %f0 + movxtod %o5, %f2 + srl $rounds, 1, $rounds + ldd [$key + 32], %f16 + sub $rounds, 1, $rounds + ldd [$key + 40], %f18 + add $key, 48, $key + +.Lenc: + aes_eround01 %f12, %f0, %f2, %f4 + aes_eround23 %f14, %f0, %f2, %f2 + ldd [$key + 0], %f12 + ldd [$key + 8], %f14 + sub $rounds,1,$rounds + aes_eround01 %f16, %f4, %f2, %f0 + aes_eround23 %f18, %f4, %f2, %f2 + ldd [$key + 16], %f16 + ldd [$key + 24], %f18 + brnz,pt $rounds, .Lenc + add $key, 32, $key + + andcc $out, 7, $tmp ! is output aligned? + aes_eround01 %f12, %f0, %f2, %f4 + aes_eround23 %f14, %f0, %f2, %f2 + aes_eround01_l %f16, %f4, %f2, %f0 + aes_eround23_l %f18, %f4, %f2, %f2 + + bnz,pn %icc, 2f + nop + + std %f0, [$out + 0] + retl + std %f2, [$out + 8] + +2: alignaddrl $out, %g0, $out + mov 0xff, $mask + srl $mask, $tmp, $mask + + faligndata %f0, %f0, %f4 + faligndata %f0, %f2, %f6 + faligndata %f2, %f2, %f8 + + stda %f4, [$out + $mask]0xc0 ! partial store + std %f6, [$out + 8] + add $out, 16, $out + orn %g0, $mask, $mask + retl + stda %f8, [$out + $mask]0xc0 ! partial store +.type aes_t4_encrypt,#function +.size aes_t4_encrypt,.-aes_t4_encrypt + +.globl aes_t4_decrypt +.align 32 +aes_t4_decrypt: + andcc $inp, 7, %g1 ! is input aligned? 
+ andn $inp, 7, $inp + + ldx [$key + 0], %g4 + ldx [$key + 8], %g5 + + ldx [$inp + 0], %o4 + bz,pt %icc, 1f + ldx [$inp + 8], %o5 + ldx [$inp + 16], $inp + sll %g1, 3, %g1 + sub %g0, %g1, %o3 + sllx %o4, %g1, %o4 + sllx %o5, %g1, %g1 + srlx %o5, %o3, %o5 + srlx $inp, %o3, %o3 + or %o5, %o4, %o4 + or %o3, %g1, %o5 +1: + ld [$key + 240], $rounds + ldd [$key + 16], %f12 + ldd [$key + 24], %f14 + xor %g4, %o4, %o4 + xor %g5, %o5, %o5 + movxtod %o4, %f0 + movxtod %o5, %f2 + srl $rounds, 1, $rounds + ldd [$key + 32], %f16 + sub $rounds, 1, $rounds + ldd [$key + 40], %f18 + add $key, 48, $key + +.Ldec: + aes_dround01 %f12, %f0, %f2, %f4 + aes_dround23 %f14, %f0, %f2, %f2 + ldd [$key + 0], %f12 + ldd [$key + 8], %f14 + sub $rounds,1,$rounds + aes_dround01 %f16, %f4, %f2, %f0 + aes_dround23 %f18, %f4, %f2, %f2 + ldd [$key + 16], %f16 + ldd [$key + 24], %f18 + brnz,pt $rounds, .Ldec + add $key, 32, $key + + andcc $out, 7, $tmp ! is output aligned? + aes_dround01 %f12, %f0, %f2, %f4 + aes_dround23 %f14, %f0, %f2, %f2 + aes_dround01_l %f16, %f4, %f2, %f0 + aes_dround23_l %f18, %f4, %f2, %f2 + + bnz,pn %icc, 2f + nop + + std %f0, [$out + 0] + retl + std %f2, [$out + 8] + +2: alignaddrl $out, %g0, $out + mov 0xff, $mask + srl $mask, $tmp, $mask + + faligndata %f0, %f0, %f4 + faligndata %f0, %f2, %f6 + faligndata %f2, %f2, %f8 + + stda %f4, [$out + $mask]0xc0 ! partial store + std %f6, [$out + 8] + add $out, 16, $out + orn %g0, $mask, $mask + retl + stda %f8, [$out + $mask]0xc0 ! partial store +.type aes_t4_decrypt,#function +.size aes_t4_decrypt,.-aes_t4_decrypt +___ +} + +###################################################################### +# key setup subroutines +# +{ +my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5)); +$code.=<<___; +.globl aes_t4_set_encrypt_key +.align 32 +aes_t4_set_encrypt_key: +.Lset_encrypt_key: + and $inp, 7, $tmp + alignaddr $inp, %g0, $inp + cmp $bits, 192 + ldd [$inp + 0], %f0 + bl,pt %icc,.L128 + ldd [$inp + 8], %f2 + + be,pt %icc,.L192 + ldd [$inp + 16], %f4 + brz,pt $tmp, .L256aligned + ldd [$inp + 24], %f6 + + ldd [$inp + 32], %f8 + faligndata %f0, %f2, %f0 + faligndata %f2, %f4, %f2 + faligndata %f4, %f6, %f4 + faligndata %f6, %f8, %f6 +.L256aligned: +___ +for ($i=0; $i<6; $i++) { + $code.=<<___; + std %f0, [$out + `32*$i+0`] + aes_kexpand1 %f0, %f6, $i, %f0 + std %f2, [$out + `32*$i+8`] + aes_kexpand2 %f2, %f0, %f2 + std %f4, [$out + `32*$i+16`] + aes_kexpand0 %f4, %f2, %f4 + std %f6, [$out + `32*$i+24`] + aes_kexpand2 %f6, %f4, %f6 +___ +} +$code.=<<___; + std %f0, [$out + `32*$i+0`] + aes_kexpand1 %f0, %f6, $i, %f0 + std %f2, [$out + `32*$i+8`] + aes_kexpand2 %f2, %f0, %f2 + std %f4, [$out + `32*$i+16`] + std %f6, [$out + `32*$i+24`] + std %f0, [$out + `32*$i+32`] + std %f2, [$out + `32*$i+40`] + + mov 14, $tmp + st $tmp, [$out + 240] + retl + xor %o0, %o0, %o0 + +.align 16 +.L192: + brz,pt $tmp, .L192aligned + nop + + ldd [$inp + 24], %f6 + faligndata %f0, %f2, %f0 + faligndata %f2, %f4, %f2 + faligndata %f4, %f6, %f4 +.L192aligned: +___ +for ($i=0; $i<7; $i++) { + $code.=<<___; + std %f0, [$out + `24*$i+0`] + aes_kexpand1 %f0, %f4, $i, %f0 + std %f2, [$out + `24*$i+8`] + aes_kexpand2 %f2, %f0, %f2 + std %f4, [$out + `24*$i+16`] + aes_kexpand2 %f4, %f2, %f4 +___ +} +$code.=<<___; + std %f0, [$out + `24*$i+0`] + aes_kexpand1 %f0, %f4, $i, %f0 + std %f2, [$out + `24*$i+8`] + aes_kexpand2 %f2, %f0, %f2 + std %f4, [$out + `24*$i+16`] + std %f0, [$out + `24*$i+24`] + std %f2, [$out + `24*$i+32`] + + mov 12, $tmp + st $tmp, [$out + 240] + retl + xor %o0, %o0, %o0 + +.align 
16 +.L128: + brz,pt $tmp, .L128aligned + nop + + ldd [$inp + 16], %f4 + faligndata %f0, %f2, %f0 + faligndata %f2, %f4, %f2 +.L128aligned: +___ +for ($i=0; $i<10; $i++) { + $code.=<<___; + std %f0, [$out + `16*$i+0`] + aes_kexpand1 %f0, %f2, $i, %f0 + std %f2, [$out + `16*$i+8`] + aes_kexpand2 %f2, %f0, %f2 +___ +} +$code.=<<___; + std %f0, [$out + `16*$i+0`] + std %f2, [$out + `16*$i+8`] + + mov 10, $tmp + st $tmp, [$out + 240] + retl + xor %o0, %o0, %o0 +.type aes_t4_set_encrypt_key,#function +.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key + +.globl aes_t4_set_decrypt_key +.align 32 +aes_t4_set_decrypt_key: + mov %o7, %o5 + call .Lset_encrypt_key + nop + + mov %o5, %o7 + sll $tmp, 4, $inp ! $tmp is number of rounds + add $tmp, 2, $tmp + add $out, $inp, $inp ! $inp=$out+16*rounds + srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4 + +.Lkey_flip: + ldd [$out + 0], %f0 + ldd [$out + 8], %f2 + ldd [$out + 16], %f4 + ldd [$out + 24], %f6 + ldd [$inp + 0], %f8 + ldd [$inp + 8], %f10 + ldd [$inp - 16], %f12 + ldd [$inp - 8], %f14 + sub $tmp, 1, $tmp + std %f0, [$inp + 0] + std %f2, [$inp + 8] + std %f4, [$inp - 16] + std %f6, [$inp - 8] + std %f8, [$out + 0] + std %f10, [$out + 8] + std %f12, [$out + 16] + std %f14, [$out + 24] + add $out, 32, $out + brnz $tmp, .Lkey_flip + sub $inp, 32, $inp + + retl + xor %o0, %o0, %o0 +.type aes_t4_set_decrypt_key,#function +.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key +___ +} + +{{{ +my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5)); +my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7)); + +$code.=<<___; +.align 32 +_aes128_loadkey: + ldx [$key + 0], %g4 + ldx [$key + 8], %g5 +___ +for ($i=2; $i<22;$i++) { # load key schedule + $code.=<<___; + ldd [$key + `8*$i`], %f`12+2*$i` +___ +} +$code.=<<___; + retl + nop +.type _aes128_loadkey,#function +.size _aes128_loadkey,.-_aes128_loadkey +_aes128_load_enckey=_aes128_loadkey +_aes128_load_deckey=_aes128_loadkey + +.align 32 +_aes128_encrypt_1x: +___ +for ($i=0; $i<4; $i++) { + $code.=<<___; + aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4 + aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0 + aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2 +___ +} +$code.=<<___; + aes_eround01 %f48, %f0, %f2, %f4 + aes_eround23 %f50, %f0, %f2, %f2 + aes_eround01_l %f52, %f4, %f2, %f0 + retl + aes_eround23_l %f54, %f4, %f2, %f2 +.type _aes128_encrypt_1x,#function +.size _aes128_encrypt_1x,.-_aes128_encrypt_1x + +.align 32 +_aes128_encrypt_2x: +___ +for ($i=0; $i<4; $i++) { + $code.=<<___; + aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8 + aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10 + aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6 + aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0 + aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2 + aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4 + aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6 +___ +} +$code.=<<___; + aes_eround01 %f48, %f0, %f2, %f8 + aes_eround23 %f50, %f0, %f2, %f2 + aes_eround01 %f48, %f4, %f6, %f10 + aes_eround23 %f50, %f4, %f6, %f6 + aes_eround01_l %f52, %f8, %f2, %f0 + aes_eround23_l %f54, %f8, %f2, %f2 + aes_eround01_l %f52, %f10, %f6, %f4 + retl + aes_eround23_l %f54, %f10, %f6, %f6 +.type _aes128_encrypt_2x,#function +.size _aes128_encrypt_2x,.-_aes128_encrypt_2x + +.align 32 +_aes128_decrypt_1x: +___ +for ($i=0; $i<4; $i++) { + $code.=<<___; + aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4 + aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0 + aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2 +___ 
+} +$code.=<<___; + aes_dround01 %f48, %f0, %f2, %f4 + aes_dround23 %f50, %f0, %f2, %f2 + aes_dround01_l %f52, %f4, %f2, %f0 + retl + aes_dround23_l %f54, %f4, %f2, %f2 +.type _aes128_decrypt_1x,#function +.size _aes128_decrypt_1x,.-_aes128_decrypt_1x + +.align 32 +_aes128_decrypt_2x: +___ +for ($i=0; $i<4; $i++) { + $code.=<<___; + aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8 + aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10 + aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6 + aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0 + aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2 + aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4 + aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6 +___ +} +$code.=<<___; + aes_dround01 %f48, %f0, %f2, %f8 + aes_dround23 %f50, %f0, %f2, %f2 + aes_dround01 %f48, %f4, %f6, %f10 + aes_dround23 %f50, %f4, %f6, %f6 + aes_dround01_l %f52, %f8, %f2, %f0 + aes_dround23_l %f54, %f8, %f2, %f2 + aes_dround01_l %f52, %f10, %f6, %f4 + retl + aes_dround23_l %f54, %f10, %f6, %f6 +.type _aes128_decrypt_2x,#function +.size _aes128_decrypt_2x,.-_aes128_decrypt_2x + +.align 32 +_aes192_loadkey: +_aes256_loadkey: + ldx [$key + 0], %g4 + ldx [$key + 8], %g5 +___ +for ($i=2; $i<26;$i++) { # load key schedule + $code.=<<___; + ldd [$key + `8*$i`], %f`12+2*$i` +___ +} +$code.=<<___; + retl + nop +.type _aes192_loadkey,#function +.size _aes192_loadkey,.-_aes192_loadkey +_aes192_load_enckey=_aes192_loadkey +_aes192_load_deckey=_aes192_loadkey +_aes256_load_enckey=_aes192_loadkey +_aes256_load_deckey=_aes192_loadkey + +.align 32 +_aes192_encrypt_1x: +___ +for ($i=0; $i<5; $i++) { + $code.=<<___; + aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4 + aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0 + aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2 +___ +} +$code.=<<___; + aes_eround01 %f56, %f0, %f2, %f4 + aes_eround23 %f58, %f0, %f2, %f2 + aes_eround01_l %f60, %f4, %f2, %f0 + retl + aes_eround23_l %f62, %f4, %f2, %f2 +.type _aes192_encrypt_1x,#function +.size _aes192_encrypt_1x,.-_aes192_encrypt_1x + +.align 32 +_aes192_encrypt_2x: +___ +for ($i=0; $i<5; $i++) { + $code.=<<___; + aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8 + aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10 + aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6 + aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0 + aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2 + aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4 + aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6 +___ +} +$code.=<<___; + aes_eround01 %f56, %f0, %f2, %f8 + aes_eround23 %f58, %f0, %f2, %f2 + aes_eround01 %f56, %f4, %f6, %f10 + aes_eround23 %f58, %f4, %f6, %f6 + aes_eround01_l %f60, %f8, %f2, %f0 + aes_eround23_l %f62, %f8, %f2, %f2 + aes_eround01_l %f60, %f10, %f6, %f4 + retl + aes_eround23_l %f62, %f10, %f6, %f6 +.type _aes192_encrypt_2x,#function +.size _aes192_encrypt_2x,.-_aes192_encrypt_2x + +.align 32 +_aes192_decrypt_1x: +___ +for ($i=0; $i<5; $i++) { + $code.=<<___; + aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4 + aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0 + aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2 +___ +} +$code.=<<___; + aes_dround01 %f56, %f0, %f2, %f4 + aes_dround23 %f58, %f0, %f2, %f2 + aes_dround01_l %f60, %f4, %f2, %f0 + retl + aes_dround23_l %f62, %f4, %f2, %f2 +.type _aes192_decrypt_1x,#function +.size _aes192_decrypt_1x,.-_aes192_decrypt_1x + +.align 32 +_aes192_decrypt_2x: +___ +for ($i=0; $i<5; $i++) { + $code.=<<___; + aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8 + aes_dround23 
%f`16+8*$i+2`, %f0, %f2, %f2 + aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10 + aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6 + aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0 + aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2 + aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4 + aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6 +___ +} +$code.=<<___; + aes_dround01 %f56, %f0, %f2, %f8 + aes_dround23 %f58, %f0, %f2, %f2 + aes_dround01 %f56, %f4, %f6, %f10 + aes_dround23 %f58, %f4, %f6, %f6 + aes_dround01_l %f60, %f8, %f2, %f0 + aes_dround23_l %f62, %f8, %f2, %f2 + aes_dround01_l %f60, %f10, %f6, %f4 + retl + aes_dround23_l %f62, %f10, %f6, %f6 +.type _aes192_decrypt_2x,#function +.size _aes192_decrypt_2x,.-_aes192_decrypt_2x + +.align 32 +_aes256_encrypt_1x: + aes_eround01 %f16, %f0, %f2, %f4 + aes_eround23 %f18, %f0, %f2, %f2 + ldd [$key + 208], %f16 + ldd [$key + 216], %f18 + aes_eround01 %f20, %f4, %f2, %f0 + aes_eround23 %f22, %f4, %f2, %f2 + ldd [$key + 224], %f20 + ldd [$key + 232], %f22 +___ +for ($i=1; $i<6; $i++) { + $code.=<<___; + aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4 + aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0 + aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2 +___ +} +$code.=<<___; + aes_eround01 %f16, %f0, %f2, %f4 + aes_eround23 %f18, %f0, %f2, %f2 + ldd [$key + 16], %f16 + ldd [$key + 24], %f18 + aes_eround01_l %f20, %f4, %f2, %f0 + aes_eround23_l %f22, %f4, %f2, %f2 + ldd [$key + 32], %f20 + retl + ldd [$key + 40], %f22 +.type _aes256_encrypt_1x,#function +.size _aes256_encrypt_1x,.-_aes256_encrypt_1x + +.align 32 +_aes256_encrypt_2x: + aes_eround01 %f16, %f0, %f2, %f8 + aes_eround23 %f18, %f0, %f2, %f2 + aes_eround01 %f16, %f4, %f6, %f10 + aes_eround23 %f18, %f4, %f6, %f6 + ldd [$key + 208], %f16 + ldd [$key + 216], %f18 + aes_eround01 %f20, %f8, %f2, %f0 + aes_eround23 %f22, %f8, %f2, %f2 + aes_eround01 %f20, %f10, %f6, %f4 + aes_eround23 %f22, %f10, %f6, %f6 + ldd [$key + 224], %f20 + ldd [$key + 232], %f22 +___ +for ($i=1; $i<6; $i++) { + $code.=<<___; + aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8 + aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10 + aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6 + aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0 + aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2 + aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4 + aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6 +___ +} +$code.=<<___; + aes_eround01 %f16, %f0, %f2, %f8 + aes_eround23 %f18, %f0, %f2, %f2 + aes_eround01 %f16, %f4, %f6, %f10 + aes_eround23 %f18, %f4, %f6, %f6 + ldd [$key + 16], %f16 + ldd [$key + 24], %f18 + aes_eround01_l %f20, %f8, %f2, %f0 + aes_eround23_l %f22, %f8, %f2, %f2 + aes_eround01_l %f20, %f10, %f6, %f4 + aes_eround23_l %f22, %f10, %f6, %f6 + ldd [$key + 32], %f20 + retl + ldd [$key + 40], %f22 +.type _aes256_encrypt_2x,#function +.size _aes256_encrypt_2x,.-_aes256_encrypt_2x + +.align 32 +_aes256_decrypt_1x: + aes_dround01 %f16, %f0, %f2, %f4 + aes_dround23 %f18, %f0, %f2, %f2 + ldd [$key + 208], %f16 + ldd [$key + 216], %f18 + aes_dround01 %f20, %f4, %f2, %f0 + aes_dround23 %f22, %f4, %f2, %f2 + ldd [$key + 224], %f20 + ldd [$key + 232], %f22 +___ +for ($i=1; $i<6; $i++) { + $code.=<<___; + aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4 + aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0 + aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2 +___ +} +$code.=<<___; + aes_dround01 %f16, %f0, %f2, %f4 + aes_dround23 %f18, %f0, %f2, %f2 + ldd [$key + 16], %f16 + ldd [$key + 24], %f18 + aes_dround01_l %f20, %f4, %f2, %f0 + 
aes_dround23_l %f22, %f4, %f2, %f2 + ldd [$key + 32], %f20 + retl + ldd [$key + 40], %f22 +.type _aes256_decrypt_1x,#function +.size _aes256_decrypt_1x,.-_aes256_decrypt_1x + +.align 32 +_aes256_decrypt_2x: + aes_dround01 %f16, %f0, %f2, %f8 + aes_dround23 %f18, %f0, %f2, %f2 + aes_dround01 %f16, %f4, %f6, %f10 + aes_dround23 %f18, %f4, %f6, %f6 + ldd [$key + 208], %f16 + ldd [$key + 216], %f18 + aes_dround01 %f20, %f8, %f2, %f0 + aes_dround23 %f22, %f8, %f2, %f2 + aes_dround01 %f20, %f10, %f6, %f4 + aes_dround23 %f22, %f10, %f6, %f6 + ldd [$key + 224], %f20 + ldd [$key + 232], %f22 +___ +for ($i=1; $i<6; $i++) { + $code.=<<___; + aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8 + aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2 + aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10 + aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6 + aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0 + aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2 + aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4 + aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6 +___ +} +$code.=<<___; + aes_dround01 %f16, %f0, %f2, %f8 + aes_dround23 %f18, %f0, %f2, %f2 + aes_dround01 %f16, %f4, %f6, %f10 + aes_dround23 %f18, %f4, %f6, %f6 + ldd [$key + 16], %f16 + ldd [$key + 24], %f18 + aes_dround01_l %f20, %f8, %f2, %f0 + aes_dround23_l %f22, %f8, %f2, %f2 + aes_dround01_l %f20, %f10, %f6, %f4 + aes_dround23_l %f22, %f10, %f6, %f6 + ldd [$key + 32], %f20 + retl + ldd [$key + 40], %f22 +.type _aes256_decrypt_2x,#function +.size _aes256_decrypt_2x,.-_aes256_decrypt_2x +___ + +&alg_cbc_encrypt_implement("aes",128); +&alg_cbc_encrypt_implement("aes",192); +&alg_cbc_encrypt_implement("aes",256); + +&alg_cbc_decrypt_implement("aes",128); +&alg_cbc_decrypt_implement("aes",192); +&alg_cbc_decrypt_implement("aes",256); + +if ($::evp) { + &alg_ctr32_implement("aes",128); + &alg_ctr32_implement("aes",192); + &alg_ctr32_implement("aes",256); +} +}}} + +if (!$::evp) { +$code.=<<___; +.global AES_encrypt +AES_encrypt=aes_t4_encrypt +.global AES_decrypt +AES_decrypt=aes_t4_decrypt +.global AES_set_encrypt_key +.align 32 +AES_set_encrypt_key: + andcc %o2, 7, %g0 ! check alignment + bnz,a,pn %icc, 1f + mov -1, %o0 + brz,a,pn %o0, 1f + mov -1, %o0 + brz,a,pn %o2, 1f + mov -1, %o0 + andncc %o1, 0x1c0, %g0 + bnz,a,pn %icc, 1f + mov -2, %o0 + cmp %o1, 128 + bl,a,pn %icc, 1f + mov -2, %o0 + b aes_t4_set_encrypt_key + nop +1: retl + nop +.type AES_set_encrypt_key,#function +.size AES_set_encrypt_key,.-AES_set_encrypt_key + +.global AES_set_decrypt_key +.align 32 +AES_set_decrypt_key: + andcc %o2, 7, %g0 ! check alignment + bnz,a,pn %icc, 1f + mov -1, %o0 + brz,a,pn %o0, 1f + mov -1, %o0 + brz,a,pn %o2, 1f + mov -1, %o0 + andncc %o1, 0x1c0, %g0 + bnz,a,pn %icc, 1f + mov -2, %o0 + cmp %o1, 128 + bl,a,pn %icc, 1f + mov -2, %o0 + b aes_t4_set_decrypt_key + nop +1: retl + nop +.type AES_set_decrypt_key,#function +.size AES_set_decrypt_key,.-AES_set_decrypt_key +___ + +my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5)); + +$code.=<<___; +.globl AES_cbc_encrypt +.align 32 +AES_cbc_encrypt: + ld [$key + 240], %g1 + nop + brz $enc, .Lcbc_decrypt + cmp %g1, 12 + + bl,pt %icc, aes128_t4_cbc_encrypt + nop + be,pn %icc, aes192_t4_cbc_encrypt + nop + ba aes256_t4_cbc_encrypt + nop + +.Lcbc_decrypt: + bl,pt %icc, aes128_t4_cbc_decrypt + nop + be,pn %icc, aes192_t4_cbc_decrypt + nop + ba aes256_t4_cbc_decrypt + nop +.type AES_cbc_encrypt,#function +.size AES_cbc_encrypt,.-AES_cbc_encrypt +___ +} +$code.=<<___; +.asciz "AES for SPARC T4, David S. 
Miller, Andy Polyakov" +.align 4 +___ + +&emit_assembler(); + +close STDOUT; diff -r 70e041ba5b04 -r f7ee98f5749e components/openssl/openssl-1.0.1/inline-t4/sparcv9_modes.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/components/openssl/openssl-1.0.1/inline-t4/sparcv9_modes.pl Wed Jul 17 15:19:38 2013 -0700 @@ -0,0 +1,1680 @@ +#!/usr/bin/env perl + +# Specific modes implementations for SPARC Architecture 2011. There +# is T4 dependency though, an ASI value that is not specified in the +# Architecture Manual. But as SPARC universe is rather monocultural, +# we imply that processor capable of executing crypto instructions +# can handle the ASI in question as well. This means that we ought to +# keep eyes open when new processors emerge... +# +# As for above mentioned ASI. It's so called "block initializing +# store" which cancels "read" in "read-update-write" on cache lines. +# This is "cooperative" optimization, as it reduces overall pressure +# on memory interface. Benefits can't be observed/quantified with +# usual benchmarks, on the contrary you can notice that single-thread +# performance for parallelizable modes is ~1.5% worse for largest +# block sizes [though few percent better for not so long ones]. All +# this based on suggestions from David Miller. + +sub asm_init { # to be called with @ARGV as argument + for (@_) { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); } + if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; } + else { $::bias=0; $::frame=112; $::size_t_cc="%icc"; } +} + +# unified interface +my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5)); +# local variables +my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7)); + +sub alg_cbc_encrypt_implement { +my ($alg,$bits) = @_; + +$::code.=<<___; +.globl ${alg}${bits}_t4_cbc_encrypt +.align 32 +${alg}${bits}_t4_cbc_encrypt: + save %sp, -$::frame, %sp + sub $inp, $out, $blk_init ! $inp!=$out +___ +$::code.=<<___ if (!$::evp); + andcc $ivec, 7, $ivoff + alignaddr $ivec, %g0, $ivec + + ldd [$ivec + 0], %f0 ! load ivec + bz,pt %icc, 1f + ldd [$ivec + 8], %f2 + ldd [$ivec + 16], %f4 + faligndata %f0, %f2, %f0 + faligndata %f2, %f4, %f2 +1: +___ +$::code.=<<___ if ($::evp); + ld [$ivec + 0], %f0 + ld [$ivec + 4], %f1 + ld [$ivec + 8], %f2 + ld [$ivec + 12], %f3 +___ +$::code.=<<___; + prefetch [$inp], 20 + prefetch [$inp + 63], 20 + call _${alg}${bits}_load_enckey + and $inp, 7, $ileft + andn $inp, 7, $inp + sll $ileft, 3, $ileft + mov 64, $iright + mov 0xff, $omask + sub $iright, $ileft, $iright + and $out, 7, $ooff + cmp $len, 127 + movrnz $ooff, 0, $blk_init ! if ( $out&7 || + movleu $::size_t_cc, 0, $blk_init ! $len<128 || + brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out) + srl $omask, $ooff, $omask + + alignaddrl $out, %g0, $out + srlx $len, 4, $len + prefetch [$out], 22 + +.L${bits}_cbc_enc_loop: + ldx [$inp + 0], %o0 + brz,pt $ileft, 4f + ldx [$inp + 8], %o1 + + ldx [$inp + 16], %o2 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + sllx %o1, $ileft, %o1 + or %g1, %o0, %o0 + srlx %o2, $iright, %o2 + or %o2, %o1, %o1 +4: + xor %g4, %o0, %o0 ! ^= rk[0] + xor %g5, %o1, %o1 + movxtod %o0, %f12 + movxtod %o1, %f14 + + fxor %f12, %f0, %f0 ! 
^= ivec + fxor %f14, %f2, %f2 + prefetch [$out + 63], 22 + prefetch [$inp + 16+63], 20 + call _${alg}${bits}_encrypt_1x + add $inp, 16, $inp + + brnz,pn $ooff, 2f + sub $len, 1, $len + + std %f0, [$out + 0] + std %f2, [$out + 8] + brnz,pt $len, .L${bits}_cbc_enc_loop + add $out, 16, $out +___ +$::code.=<<___ if ($::evp); + st %f0, [$ivec + 0] + st %f1, [$ivec + 4] + st %f2, [$ivec + 8] + st %f3, [$ivec + 12] +___ +$::code.=<<___ if (!$::evp); + brnz,pn $ivoff, 3f + nop + + std %f0, [$ivec + 0] ! write out ivec + std %f2, [$ivec + 8] +___ +$::code.=<<___; + ret + restore + +.align 16 +2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard + ! and ~3x deterioration + ! in inp==out case + faligndata %f0, %f0, %f4 ! handle unaligned output + faligndata %f0, %f2, %f6 + faligndata %f2, %f2, %f8 + + stda %f4, [$out + $omask]0xc0 ! partial store + std %f6, [$out + 8] + add $out, 16, $out + orn %g0, $omask, $omask + stda %f8, [$out + $omask]0xc0 ! partial store + + brnz,pt $len, .L${bits}_cbc_enc_loop+4 + orn %g0, $omask, $omask +___ +$::code.=<<___ if ($::evp); + st %f0, [$ivec + 0] + st %f1, [$ivec + 4] + st %f2, [$ivec + 8] + st %f3, [$ivec + 12] +___ +$::code.=<<___ if (!$::evp); + brnz,pn $ivoff, 3f + nop + + std %f0, [$ivec + 0] ! write out ivec + std %f2, [$ivec + 8] + ret + restore + +.align 16 +3: alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec + mov 0xff, $omask + srl $omask, $ivoff, $omask + faligndata %f0, %f0, %f4 + faligndata %f0, %f2, %f6 + faligndata %f2, %f2, %f8 + stda %f4, [$ivec + $omask]0xc0 + std %f6, [$ivec + 8] + add $ivec, 16, $ivec + orn %g0, $omask, $omask + stda %f8, [$ivec + $omask]0xc0 +___ +$::code.=<<___; + ret + restore + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +.align 32 +.L${bits}cbc_enc_blk: + add $out, $len, $blk_init + and $blk_init, 63, $blk_init ! tail + sub $len, $blk_init, $len + add $blk_init, 15, $blk_init ! round up to 16n + srlx $len, 4, $len + srl $blk_init, 4, $blk_init + +.L${bits}_cbc_enc_blk_loop: + ldx [$inp + 0], %o0 + brz,pt $ileft, 5f + ldx [$inp + 8], %o1 + + ldx [$inp + 16], %o2 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + sllx %o1, $ileft, %o1 + or %g1, %o0, %o0 + srlx %o2, $iright, %o2 + or %o2, %o1, %o1 +5: + xor %g4, %o0, %o0 ! ^= rk[0] + xor %g5, %o1, %o1 + movxtod %o0, %f12 + movxtod %o1, %f14 + + fxor %f12, %f0, %f0 ! ^= ivec + fxor %f14, %f2, %f2 + prefetch [$inp + 16+63], 20 + call _${alg}${bits}_encrypt_1x + add $inp, 16, $inp + sub $len, 1, $len + + stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific + add $out, 8, $out + stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific + brnz,pt $len, .L${bits}_cbc_enc_blk_loop + add $out, 8, $out + + membar #StoreLoad|#StoreStore + brnz,pt $blk_init, .L${bits}_cbc_enc_loop + mov $blk_init, $len +___ +$::code.=<<___ if ($::evp); + st %f0, [$ivec + 0] + st %f1, [$ivec + 4] + st %f2, [$ivec + 8] + st %f3, [$ivec + 12] +___ +$::code.=<<___ if (!$::evp); + brnz,pn $ivoff, 3b + nop + + std %f0, [$ivec + 0] ! write out ivec + std %f2, [$ivec + 8] +___ +$::code.=<<___; + ret + restore +.type ${alg}${bits}_t4_cbc_encrypt,#function +.size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt +___ +} + +sub alg_cbc_decrypt_implement { +my ($alg,$bits) = @_; + +$::code.=<<___; +.globl ${alg}${bits}_t4_cbc_decrypt +.align 32 +${alg}${bits}_t4_cbc_decrypt: + save %sp, -$::frame, %sp + sub $inp, $out, $blk_init ! $inp!=$out +___ +$::code.=<<___ if (!$::evp); + andcc $ivec, 7, $ivoff + alignaddr $ivec, %g0, $ivec + + ldd [$ivec + 0], %f12 ! 
load ivec + bz,pt %icc, 1f + ldd [$ivec + 8], %f14 + ldd [$ivec + 16], %f0 + faligndata %f12, %f14, %f12 + faligndata %f14, %f0, %f14 +1: +___ +$::code.=<<___ if ($::evp); + ld [$ivec + 0], %f12 ! load ivec + ld [$ivec + 4], %f13 + ld [$ivec + 8], %f14 + ld [$ivec + 12], %f15 +___ +$::code.=<<___; + prefetch [$inp], 20 + prefetch [$inp + 63], 20 + call _${alg}${bits}_load_deckey + and $inp, 7, $ileft + andn $inp, 7, $inp + sll $ileft, 3, $ileft + mov 64, $iright + mov 0xff, $omask + sub $iright, $ileft, $iright + and $out, 7, $ooff + cmp $len, 255 + movrnz $ooff, 0, $blk_init ! if ( $out&7 || + movleu $::size_t_cc, 0, $blk_init ! $len<256 || + brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out) + srl $omask, $ooff, $omask + + andcc $len, 16, %g0 ! is number of blocks even? + srlx $len, 4, $len + alignaddrl $out, %g0, $out + bz %icc, .L${bits}_cbc_dec_loop2x + prefetch [$out], 22 +.L${bits}_cbc_dec_loop: + ldx [$inp + 0], %o0 + brz,pt $ileft, 4f + ldx [$inp + 8], %o1 + + ldx [$inp + 16], %o2 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + sllx %o1, $ileft, %o1 + or %g1, %o0, %o0 + srlx %o2, $iright, %o2 + or %o2, %o1, %o1 +4: + xor %g4, %o0, %o2 ! ^= rk[0] + xor %g5, %o1, %o3 + movxtod %o2, %f0 + movxtod %o3, %f2 + + prefetch [$out + 63], 22 + prefetch [$inp + 16+63], 20 + call _${alg}${bits}_decrypt_1x + add $inp, 16, $inp + + fxor %f12, %f0, %f0 ! ^= ivec + fxor %f14, %f2, %f2 + movxtod %o0, %f12 + movxtod %o1, %f14 + + brnz,pn $ooff, 2f + sub $len, 1, $len + + std %f0, [$out + 0] + std %f2, [$out + 8] + brnz,pt $len, .L${bits}_cbc_dec_loop2x + add $out, 16, $out +___ +$::code.=<<___ if ($::evp); + st %f12, [$ivec + 0] + st %f13, [$ivec + 4] + st %f14, [$ivec + 8] + st %f15, [$ivec + 12] +___ +$::code.=<<___ if (!$::evp); + brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec + nop + + std %f12, [$ivec + 0] ! write out ivec + std %f14, [$ivec + 8] +___ +$::code.=<<___; + ret + restore + +.align 16 +2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard + ! and ~3x deterioration + ! in inp==out case + faligndata %f0, %f0, %f4 ! handle unaligned output + faligndata %f0, %f2, %f6 + faligndata %f2, %f2, %f8 + + stda %f4, [$out + $omask]0xc0 ! partial store + std %f6, [$out + 8] + add $out, 16, $out + orn %g0, $omask, $omask + stda %f8, [$out + $omask]0xc0 ! partial store + + brnz,pt $len, .L${bits}_cbc_dec_loop2x+4 + orn %g0, $omask, $omask +___ +$::code.=<<___ if ($::evp); + st %f12, [$ivec + 0] + st %f13, [$ivec + 4] + st %f14, [$ivec + 8] + st %f15, [$ivec + 12] +___ +$::code.=<<___ if (!$::evp); + brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec + nop + + std %f12, [$ivec + 0] ! write out ivec + std %f14, [$ivec + 8] +___ +$::code.=<<___; + ret + restore + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +.align 32 +.L${bits}_cbc_dec_loop2x: + ldx [$inp + 0], %o0 + ldx [$inp + 8], %o1 + ldx [$inp + 16], %o2 + brz,pt $ileft, 4f + ldx [$inp + 24], %o3 + + ldx [$inp + 32], %o4 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + or %g1, %o0, %o0 + sllx %o1, $ileft, %o1 + srlx %o2, $iright, %g1 + or %g1, %o1, %o1 + sllx %o2, $ileft, %o2 + srlx %o3, $iright, %g1 + or %g1, %o2, %o2 + sllx %o3, $ileft, %o3 + srlx %o4, $iright, %o4 + or %o4, %o3, %o3 +4: + xor %g4, %o0, %o4 ! 
^= rk[0] + xor %g5, %o1, %o5 + movxtod %o4, %f0 + movxtod %o5, %f2 + xor %g4, %o2, %o4 + xor %g5, %o3, %o5 + movxtod %o4, %f4 + movxtod %o5, %f6 + + prefetch [$out + 63], 22 + prefetch [$inp + 32+63], 20 + call _${alg}${bits}_decrypt_2x + add $inp, 32, $inp + + movxtod %o0, %f8 + movxtod %o1, %f10 + fxor %f12, %f0, %f0 ! ^= ivec + fxor %f14, %f2, %f2 + movxtod %o2, %f12 + movxtod %o3, %f14 + fxor %f8, %f4, %f4 + fxor %f10, %f6, %f6 + + brnz,pn $ooff, 2f + sub $len, 2, $len + + std %f0, [$out + 0] + std %f2, [$out + 8] + std %f4, [$out + 16] + std %f6, [$out + 24] + brnz,pt $len, .L${bits}_cbc_dec_loop2x + add $out, 32, $out +___ +$::code.=<<___ if ($::evp); + st %f12, [$ivec + 0] + st %f13, [$ivec + 4] + st %f14, [$ivec + 8] + st %f15, [$ivec + 12] +___ +$::code.=<<___ if (!$::evp); + brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec + nop + + std %f12, [$ivec + 0] ! write out ivec + std %f14, [$ivec + 8] +___ +$::code.=<<___; + ret + restore + +.align 16 +2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard + ! and ~3x deterioration + ! in inp==out case + faligndata %f0, %f0, %f8 ! handle unaligned output + faligndata %f0, %f2, %f0 + faligndata %f2, %f4, %f2 + faligndata %f4, %f6, %f4 + faligndata %f6, %f6, %f6 + stda %f8, [$out + $omask]0xc0 ! partial store + std %f0, [$out + 8] + std %f2, [$out + 16] + std %f4, [$out + 24] + add $out, 32, $out + orn %g0, $omask, $omask + stda %f6, [$out + $omask]0xc0 ! partial store + + brnz,pt $len, .L${bits}_cbc_dec_loop2x+4 + orn %g0, $omask, $omask +___ +$::code.=<<___ if ($::evp); + st %f12, [$ivec + 0] + st %f13, [$ivec + 4] + st %f14, [$ivec + 8] + st %f15, [$ivec + 12] +___ +$::code.=<<___ if (!$::evp); + brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec + nop + + std %f12, [$ivec + 0] ! write out ivec + std %f14, [$ivec + 8] + ret + restore + +.align 16 +.L${bits}_cbc_dec_unaligned_ivec: + alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec + mov 0xff, $omask + srl $omask, $ivoff, $omask + faligndata %f12, %f12, %f0 + faligndata %f12, %f14, %f2 + faligndata %f14, %f14, %f4 + stda %f0, [$ivec + $omask]0xc0 + std %f2, [$ivec + 8] + add $ivec, 16, $ivec + orn %g0, $omask, $omask + stda %f4, [$ivec + $omask]0xc0 +___ +$::code.=<<___; + ret + restore + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +.align 32 +.L${bits}cbc_dec_blk: + add $out, $len, $blk_init + and $blk_init, 63, $blk_init ! tail + sub $len, $blk_init, $len + add $blk_init, 15, $blk_init ! round up to 16n + srlx $len, 4, $len + srl $blk_init, 4, $blk_init + sub $len, 1, $len + add $blk_init, 1, $blk_init + +.L${bits}_cbc_dec_blk_loop2x: + ldx [$inp + 0], %o0 + ldx [$inp + 8], %o1 + ldx [$inp + 16], %o2 + brz,pt $ileft, 5f + ldx [$inp + 24], %o3 + + ldx [$inp + 32], %o4 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + or %g1, %o0, %o0 + sllx %o1, $ileft, %o1 + srlx %o2, $iright, %g1 + or %g1, %o1, %o1 + sllx %o2, $ileft, %o2 + srlx %o3, $iright, %g1 + or %g1, %o2, %o2 + sllx %o3, $ileft, %o3 + srlx %o4, $iright, %o4 + or %o4, %o3, %o3 +5: + xor %g4, %o0, %o4 ! ^= rk[0] + xor %g5, %o1, %o5 + movxtod %o4, %f0 + movxtod %o5, %f2 + xor %g4, %o2, %o4 + xor %g5, %o3, %o5 + movxtod %o4, %f4 + movxtod %o5, %f6 + + prefetch [$inp + 32+63], 20 + call _${alg}${bits}_decrypt_2x + add $inp, 32, $inp + subcc $len, 2, $len + + movxtod %o0, %f8 + movxtod %o1, %f10 + fxor %f12, %f0, %f0 ! ^= ivec + fxor %f14, %f2, %f2 + movxtod %o2, %f12 + movxtod %o3, %f14 + fxor %f8, %f4, %f4 + fxor %f10, %f6, %f6 + + stda %f0, [$out]0xe2 ! 
ASI_BLK_INIT, T4-specific + add $out, 8, $out + stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific + add $out, 8, $out + stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific + add $out, 8, $out + stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific + bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x + add $out, 8, $out + + add $blk_init, $len, $len + andcc $len, 1, %g0 ! is number of blocks even? + membar #StoreLoad|#StoreStore + bnz,pt %icc, .L${bits}_cbc_dec_loop + srl $len, 0, $len + brnz,pn $len, .L${bits}_cbc_dec_loop2x + nop +___ +$::code.=<<___ if ($::evp); + st %f12, [$ivec + 0] ! write out ivec + st %f13, [$ivec + 4] + st %f14, [$ivec + 8] + st %f15, [$ivec + 12] +___ +$::code.=<<___ if (!$::evp); + brnz,pn $ivoff, 3b + nop + + std %f12, [$ivec + 0] ! write out ivec + std %f14, [$ivec + 8] +___ +$::code.=<<___; + ret + restore +.type ${alg}${bits}_t4_cbc_decrypt,#function +.size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt +___ +} + +sub alg_ctr32_implement { +my ($alg,$bits) = @_; + +$::code.=<<___; +.globl ${alg}${bits}_t4_ctr32_encrypt +.align 32 +${alg}${bits}_t4_ctr32_encrypt: + save %sp, -$::frame, %sp + + prefetch [$inp], 20 + prefetch [$inp + 63], 20 + call _${alg}${bits}_load_enckey + sllx $len, 4, $len + + ld [$ivec + 0], %l4 ! counter + ld [$ivec + 4], %l5 + ld [$ivec + 8], %l6 + ld [$ivec + 12], %l7 + + sllx %l4, 32, %o5 + or %l5, %o5, %o5 + sllx %l6, 32, %g1 + xor %o5, %g4, %g4 ! ^= rk[0] + xor %g1, %g5, %g5 + movxtod %g4, %f14 ! most significant 64 bits + + sub $inp, $out, $blk_init ! $inp!=$out + and $inp, 7, $ileft + andn $inp, 7, $inp + sll $ileft, 3, $ileft + mov 64, $iright + mov 0xff, $omask + sub $iright, $ileft, $iright + and $out, 7, $ooff + cmp $len, 255 + movrnz $ooff, 0, $blk_init ! if ( $out&7 || + movleu $::size_t_cc, 0, $blk_init ! $len<256 || + brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out) + srl $omask, $ooff, $omask + + andcc $len, 16, %g0 ! is number of blocks even? + alignaddrl $out, %g0, $out + bz %icc, .L${bits}_ctr32_loop2x + srlx $len, 4, $len +.L${bits}_ctr32_loop: + ldx [$inp + 0], %o0 + brz,pt $ileft, 4f + ldx [$inp + 8], %o1 + + ldx [$inp + 16], %o2 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + sllx %o1, $ileft, %o1 + or %g1, %o0, %o0 + srlx %o2, $iright, %o2 + or %o2, %o1, %o1 +4: + xor %g5, %l7, %g1 ! ^= rk[0] + add %l7, 1, %l7 + movxtod %g1, %f2 + srl %l7, 0, %l7 ! clruw + prefetch [$out + 63], 22 + prefetch [$inp + 16+63], 20 +___ +$::code.=<<___ if ($alg eq "aes"); + aes_eround01 %f16, %f14, %f2, %f4 + aes_eround23 %f18, %f14, %f2, %f2 +___ +$::code.=<<___ if ($alg eq "cmll"); + camellia_f %f16, %f2, %f14, %f2 + camellia_f %f18, %f14, %f2, %f0 +___ +$::code.=<<___; + call _${alg}${bits}_encrypt_1x+8 + add $inp, 16, $inp + + movxtod %o0, %f10 + movxtod %o1, %f12 + fxor %f10, %f0, %f0 ! ^= inp + fxor %f12, %f2, %f2 + + brnz,pn $ooff, 2f + sub $len, 1, $len + + std %f0, [$out + 0] + std %f2, [$out + 8] + brnz,pt $len, .L${bits}_ctr32_loop2x + add $out, 16, $out + + ret + restore + +.align 16 +2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard + ! and ~3x deterioration + ! in inp==out case + faligndata %f0, %f0, %f4 ! handle unaligned output + faligndata %f0, %f2, %f6 + faligndata %f2, %f2, %f8 + stda %f4, [$out + $omask]0xc0 ! partial store + std %f6, [$out + 8] + add $out, 16, $out + orn %g0, $omask, $omask + stda %f8, [$out + $omask]0xc0 ! partial store + + brnz,pt $len, .L${bits}_ctr32_loop2x+4 + orn %g0, $omask, $omask + + ret + restore + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+.align 32 +.L${bits}_ctr32_loop2x: + ldx [$inp + 0], %o0 + ldx [$inp + 8], %o1 + ldx [$inp + 16], %o2 + brz,pt $ileft, 4f + ldx [$inp + 24], %o3 + + ldx [$inp + 32], %o4 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + or %g1, %o0, %o0 + sllx %o1, $ileft, %o1 + srlx %o2, $iright, %g1 + or %g1, %o1, %o1 + sllx %o2, $ileft, %o2 + srlx %o3, $iright, %g1 + or %g1, %o2, %o2 + sllx %o3, $ileft, %o3 + srlx %o4, $iright, %o4 + or %o4, %o3, %o3 +4: + xor %g5, %l7, %g1 ! ^= rk[0] + add %l7, 1, %l7 + movxtod %g1, %f2 + srl %l7, 0, %l7 ! clruw + xor %g5, %l7, %g1 + add %l7, 1, %l7 + movxtod %g1, %f6 + srl %l7, 0, %l7 ! clruw + prefetch [$out + 63], 22 + prefetch [$inp + 32+63], 20 +___ +$::code.=<<___ if ($alg eq "aes"); + aes_eround01 %f16, %f14, %f2, %f8 + aes_eround23 %f18, %f14, %f2, %f2 + aes_eround01 %f16, %f14, %f6, %f10 + aes_eround23 %f18, %f14, %f6, %f6 +___ +$::code.=<<___ if ($alg eq "cmll"); + camellia_f %f16, %f2, %f14, %f2 + camellia_f %f16, %f6, %f14, %f6 + camellia_f %f18, %f14, %f2, %f0 + camellia_f %f18, %f14, %f6, %f4 +___ +$::code.=<<___; + call _${alg}${bits}_encrypt_2x+16 + add $inp, 32, $inp + + movxtod %o0, %f8 + movxtod %o1, %f10 + movxtod %o2, %f12 + fxor %f8, %f0, %f0 ! ^= inp + movxtod %o3, %f8 + fxor %f10, %f2, %f2 + fxor %f12, %f4, %f4 + fxor %f8, %f6, %f6 + + brnz,pn $ooff, 2f + sub $len, 2, $len + + std %f0, [$out + 0] + std %f2, [$out + 8] + std %f4, [$out + 16] + std %f6, [$out + 24] + brnz,pt $len, .L${bits}_ctr32_loop2x + add $out, 32, $out + + ret + restore + +.align 16 +2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard + ! and ~3x deterioration + ! in inp==out case + faligndata %f0, %f0, %f8 ! handle unaligned output + faligndata %f0, %f2, %f0 + faligndata %f2, %f4, %f2 + faligndata %f4, %f6, %f4 + faligndata %f6, %f6, %f6 + + stda %f8, [$out + $omask]0xc0 ! partial store + std %f0, [$out + 8] + std %f2, [$out + 16] + std %f4, [$out + 24] + add $out, 32, $out + orn %g0, $omask, $omask + stda %f6, [$out + $omask]0xc0 ! partial store + + brnz,pt $len, .L${bits}_ctr32_loop2x+4 + orn %g0, $omask, $omask + + ret + restore + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +.align 32 +.L${bits}_ctr32_blk: + add $out, $len, $blk_init + and $blk_init, 63, $blk_init ! tail + sub $len, $blk_init, $len + add $blk_init, 15, $blk_init ! round up to 16n + srlx $len, 4, $len + srl $blk_init, 4, $blk_init + sub $len, 1, $len + add $blk_init, 1, $blk_init + +.L${bits}_ctr32_blk_loop2x: + ldx [$inp + 0], %o0 + ldx [$inp + 8], %o1 + ldx [$inp + 16], %o2 + brz,pt $ileft, 5f + ldx [$inp + 24], %o3 + + ldx [$inp + 32], %o4 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + or %g1, %o0, %o0 + sllx %o1, $ileft, %o1 + srlx %o2, $iright, %g1 + or %g1, %o1, %o1 + sllx %o2, $ileft, %o2 + srlx %o3, $iright, %g1 + or %g1, %o2, %o2 + sllx %o3, $ileft, %o3 + srlx %o4, $iright, %o4 + or %o4, %o3, %o3 +5: + xor %g5, %l7, %g1 ! ^= rk[0] + add %l7, 1, %l7 + movxtod %g1, %f2 + srl %l7, 0, %l7 ! clruw + xor %g5, %l7, %g1 + add %l7, 1, %l7 + movxtod %g1, %f6 + srl %l7, 0, %l7 ! 
clruw + prefetch [$inp + 32+63], 20 +___ +$::code.=<<___ if ($alg eq "aes"); + aes_eround01 %f16, %f14, %f2, %f8 + aes_eround23 %f18, %f14, %f2, %f2 + aes_eround01 %f16, %f14, %f6, %f10 + aes_eround23 %f18, %f14, %f6, %f6 +___ +$::code.=<<___ if ($alg eq "cmll"); + camellia_f %f16, %f2, %f14, %f2 + camellia_f %f16, %f6, %f14, %f6 + camellia_f %f18, %f14, %f2, %f0 + camellia_f %f18, %f14, %f6, %f4 +___ +$::code.=<<___; + call _${alg}${bits}_encrypt_2x+16 + add $inp, 32, $inp + subcc $len, 2, $len + + movxtod %o0, %f8 + movxtod %o1, %f10 + movxtod %o2, %f12 + fxor %f8, %f0, %f0 ! ^= inp + movxtod %o3, %f8 + fxor %f10, %f2, %f2 + fxor %f12, %f4, %f4 + fxor %f8, %f6, %f6 + + stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific + add $out, 8, $out + stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific + add $out, 8, $out + stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific + add $out, 8, $out + stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific + bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x + add $out, 8, $out + + add $blk_init, $len, $len + andcc $len, 1, %g0 ! is number of blocks even? + membar #StoreLoad|#StoreStore + bnz,pt %icc, .L${bits}_ctr32_loop + srl $len, 0, $len + brnz,pn $len, .L${bits}_ctr32_loop2x + nop + + ret + restore +.type ${alg}${bits}_t4_ctr32_encrypt,#function +.size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt +___ +} + +sub alg_xts_implement { +my ($alg,$bits,$dir) = @_; +my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5)); +my $rem=$ivec; + +$::code.=<<___; +.globl ${alg}${bits}_t4_xts_${dir}crypt +.align 32 +${alg}${bits}_t4_xts_${dir}crypt: + save %sp, -$::frame-16, %sp + + mov $ivec, %o0 + add %fp, $::bias-16, %o1 + call ${alg}_t4_encrypt + mov $key2, %o2 + + add %fp, $::bias-16, %l7 + ldxa [%l7]0x88, %g2 + add %fp, $::bias-8, %l7 + ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak + + sethi %hi(0x76543210), %l7 + or %l7, %lo(0x76543210), %l7 + bmask %l7, %g0, %g0 ! byte swap mask + + prefetch [$inp], 20 + prefetch [$inp + 63], 20 + call _${alg}${bits}_load_${dir}ckey + and $len, 15, $rem + and $len, -16, $len +___ +$code.=<<___ if ($dir eq "de"); + mov 0, %l7 + movrnz $rem, 16, %l7 + sub $len, %l7, $len +___ +$code.=<<___; + + sub $inp, $out, $blk_init ! $inp!=$out + and $inp, 7, $ileft + andn $inp, 7, $inp + sll $ileft, 3, $ileft + mov 64, $iright + mov 0xff, $omask + sub $iright, $ileft, $iright + and $out, 7, $ooff + cmp $len, 255 + movrnz $ooff, 0, $blk_init ! if ( $out&7 || + movleu $::size_t_cc, 0, $blk_init ! $len<256 || + brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out) + srl $omask, $ooff, $omask + + andcc $len, 16, %g0 ! is number of blocks even? +___ +$code.=<<___ if ($dir eq "de"); + brz,pn $len, .L${bits}_xts_${dir}steal +___ +$code.=<<___; + alignaddrl $out, %g0, $out + bz %icc, .L${bits}_xts_${dir}loop2x + srlx $len, 4, $len +.L${bits}_xts_${dir}loop: + ldx [$inp + 0], %o0 + brz,pt $ileft, 4f + ldx [$inp + 8], %o1 + + ldx [$inp + 16], %o2 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + sllx %o1, $ileft, %o1 + or %g1, %o0, %o0 + srlx %o2, $iright, %o2 + or %o2, %o1, %o1 +4: + movxtod %g2, %f12 + movxtod %g3, %f14 + bshuffle %f12, %f12, %f12 + bshuffle %f14, %f14, %f14 + + xor %g4, %o0, %o0 ! ^= rk[0] + xor %g5, %o1, %o1 + movxtod %o0, %f0 + movxtod %o1, %f2 + + fxor %f12, %f0, %f0 ! ^= tweak[0] + fxor %f14, %f2, %f2 + + prefetch [$out + 63], 22 + prefetch [$inp + 16+63], 20 + call _${alg}${bits}_${dir}crypt_1x + add $inp, 16, $inp + + fxor %f12, %f0, %f0 ! ^= tweak[0] + fxor %f14, %f2, %f2 + + srax %g3, 63, %l7 ! 
next tweak value + addcc %g2, %g2, %g2 + and %l7, 0x87, %l7 + addxc %g3, %g3, %g3 + xor %l7, %g2, %g2 + + brnz,pn $ooff, 2f + sub $len, 1, $len + + std %f0, [$out + 0] + std %f2, [$out + 8] + brnz,pt $len, .L${bits}_xts_${dir}loop2x + add $out, 16, $out + + brnz,pn $rem, .L${bits}_xts_${dir}steal + nop + + ret + restore + +.align 16 +2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard + ! and ~3x deterioration + ! in inp==out case + faligndata %f0, %f0, %f4 ! handle unaligned output + faligndata %f0, %f2, %f6 + faligndata %f2, %f2, %f8 + stda %f4, [$out + $omask]0xc0 ! partial store + std %f6, [$out + 8] + add $out, 16, $out + orn %g0, $omask, $omask + stda %f8, [$out + $omask]0xc0 ! partial store + + brnz,pt $len, .L${bits}_xts_${dir}loop2x+4 + orn %g0, $omask, $omask + + brnz,pn $rem, .L${bits}_xts_${dir}steal + nop + + ret + restore + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +.align 32 +.L${bits}_xts_${dir}loop2x: + ldx [$inp + 0], %o0 + ldx [$inp + 8], %o1 + ldx [$inp + 16], %o2 + brz,pt $ileft, 4f + ldx [$inp + 24], %o3 + + ldx [$inp + 32], %o4 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + or %g1, %o0, %o0 + sllx %o1, $ileft, %o1 + srlx %o2, $iright, %g1 + or %g1, %o1, %o1 + sllx %o2, $ileft, %o2 + srlx %o3, $iright, %g1 + or %g1, %o2, %o2 + sllx %o3, $ileft, %o3 + srlx %o4, $iright, %o4 + or %o4, %o3, %o3 +4: + movxtod %g2, %f12 + movxtod %g3, %f14 + bshuffle %f12, %f12, %f12 + bshuffle %f14, %f14, %f14 + + srax %g3, 63, %l7 ! next tweak value + addcc %g2, %g2, %g2 + and %l7, 0x87, %l7 + addxc %g3, %g3, %g3 + xor %l7, %g2, %g2 + + movxtod %g2, %f8 + movxtod %g3, %f10 + bshuffle %f8, %f8, %f8 + bshuffle %f10, %f10, %f10 + + xor %g4, %o0, %o0 ! ^= rk[0] + xor %g5, %o1, %o1 + xor %g4, %o2, %o2 ! ^= rk[0] + xor %g5, %o3, %o3 + movxtod %o0, %f0 + movxtod %o1, %f2 + movxtod %o2, %f4 + movxtod %o3, %f6 + + fxor %f12, %f0, %f0 ! ^= tweak[0] + fxor %f14, %f2, %f2 + fxor %f8, %f4, %f4 ! ^= tweak[0] + fxor %f10, %f6, %f6 + + prefetch [$out + 63], 22 + prefetch [$inp + 32+63], 20 + call _${alg}${bits}_${dir}crypt_2x + add $inp, 32, $inp + + movxtod %g2, %f8 + movxtod %g3, %f10 + + srax %g3, 63, %l7 ! next tweak value + addcc %g2, %g2, %g2 + and %l7, 0x87, %l7 + addxc %g3, %g3, %g3 + xor %l7, %g2, %g2 + + bshuffle %f8, %f8, %f8 + bshuffle %f10, %f10, %f10 + + fxor %f12, %f0, %f0 ! ^= tweak[0] + fxor %f14, %f2, %f2 + fxor %f8, %f4, %f4 + fxor %f10, %f6, %f6 + + brnz,pn $ooff, 2f + sub $len, 2, $len + + std %f0, [$out + 0] + std %f2, [$out + 8] + std %f4, [$out + 16] + std %f6, [$out + 24] + brnz,pt $len, .L${bits}_xts_${dir}loop2x + add $out, 32, $out + + fsrc2 %f4, %f0 + fsrc2 %f6, %f2 + brnz,pn $rem, .L${bits}_xts_${dir}steal + nop + + ret + restore + +.align 16 +2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard + ! and ~3x deterioration + ! in inp==out case + faligndata %f0, %f0, %f8 ! handle unaligned output + faligndata %f0, %f2, %f10 + faligndata %f2, %f4, %f12 + faligndata %f4, %f6, %f14 + faligndata %f6, %f6, %f0 + + stda %f8, [$out + $omask]0xc0 ! partial store + std %f10, [$out + 8] + std %f12, [$out + 16] + std %f14, [$out + 24] + add $out, 32, $out + orn %g0, $omask, $omask + stda %f0, [$out + $omask]0xc0 ! partial store + + brnz,pt $len, .L${bits}_xts_${dir}loop2x+4 + orn %g0, $omask, $omask + + fsrc2 %f4, %f0 + fsrc2 %f6, %f2 + brnz,pn $rem, .L${bits}_xts_${dir}steal + nop + + ret + restore + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+.align 32 +.L${bits}_xts_${dir}blk: + add $out, $len, $blk_init + and $blk_init, 63, $blk_init ! tail + sub $len, $blk_init, $len + add $blk_init, 15, $blk_init ! round up to 16n + srlx $len, 4, $len + srl $blk_init, 4, $blk_init + sub $len, 1, $len + add $blk_init, 1, $blk_init + +.L${bits}_xts_${dir}blk2x: + ldx [$inp + 0], %o0 + ldx [$inp + 8], %o1 + ldx [$inp + 16], %o2 + brz,pt $ileft, 5f + ldx [$inp + 24], %o3 + + ldx [$inp + 32], %o4 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + or %g1, %o0, %o0 + sllx %o1, $ileft, %o1 + srlx %o2, $iright, %g1 + or %g1, %o1, %o1 + sllx %o2, $ileft, %o2 + srlx %o3, $iright, %g1 + or %g1, %o2, %o2 + sllx %o3, $ileft, %o3 + srlx %o4, $iright, %o4 + or %o4, %o3, %o3 +5: + movxtod %g2, %f12 + movxtod %g3, %f14 + bshuffle %f12, %f12, %f12 + bshuffle %f14, %f14, %f14 + + srax %g3, 63, %l7 ! next tweak value + addcc %g2, %g2, %g2 + and %l7, 0x87, %l7 + addxc %g3, %g3, %g3 + xor %l7, %g2, %g2 + + movxtod %g2, %f8 + movxtod %g3, %f10 + bshuffle %f8, %f8, %f8 + bshuffle %f10, %f10, %f10 + + xor %g4, %o0, %o0 ! ^= rk[0] + xor %g5, %o1, %o1 + xor %g4, %o2, %o2 ! ^= rk[0] + xor %g5, %o3, %o3 + movxtod %o0, %f0 + movxtod %o1, %f2 + movxtod %o2, %f4 + movxtod %o3, %f6 + + fxor %f12, %f0, %f0 ! ^= tweak[0] + fxor %f14, %f2, %f2 + fxor %f8, %f4, %f4 ! ^= tweak[0] + fxor %f10, %f6, %f6 + + prefetch [$inp + 32+63], 20 + call _${alg}${bits}_${dir}crypt_2x + add $inp, 32, $inp + + movxtod %g2, %f8 + movxtod %g3, %f10 + + srax %g3, 63, %l7 ! next tweak value + addcc %g2, %g2, %g2 + and %l7, 0x87, %l7 + addxc %g3, %g3, %g3 + xor %l7, %g2, %g2 + + bshuffle %f8, %f8, %f8 + bshuffle %f10, %f10, %f10 + + fxor %f12, %f0, %f0 ! ^= tweak[0] + fxor %f14, %f2, %f2 + fxor %f8, %f4, %f4 + fxor %f10, %f6, %f6 + + stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific + add $out, 8, $out + stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific + add $out, 8, $out + stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific + add $out, 8, $out + stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific + bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x + add $out, 8, $out + + add $blk_init, $len, $len + andcc $len, 1, %g0 ! is number of blocks even? + membar #StoreLoad|#StoreStore + bnz,pt %icc, .L${bits}_xts_${dir}loop + srl $len, 0, $len + brnz,pn $len, .L${bits}_xts_${dir}loop2x + nop + + fsrc2 %f4, %f0 + fsrc2 %f6, %f2 + brnz,pn $rem, .L${bits}_xts_${dir}steal + nop + + ret + restore +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +___ +$code.=<<___ if ($dir eq "en"); +.align 32 +.L${bits}_xts_${dir}steal: + std %f0, [%fp + $::bias-16] ! copy of output + std %f2, [%fp + $::bias-8] + + srl $ileft, 3, $ileft + add %fp, $::bias-16, %l7 + add $inp, $ileft, $inp ! original $inp+$len&-15 + add $out, $ooff, $out ! original $out+$len&-15 + mov 0, $ileft + nop ! align + +.L${bits}_xts_${dir}stealing: + ldub [$inp + $ileft], %o0 + ldub [%l7 + $ileft], %o1 + dec $rem + stb %o0, [%l7 + $ileft] + stb %o1, [$out + $ileft] + brnz $rem, .L${bits}_xts_${dir}stealing + inc $ileft + + mov %l7, $inp + sub $out, 16, $out + mov 0, $ileft + sub $out, $ooff, $out + ba .L${bits}_xts_${dir}loop ! one more time + mov 1, $len ! $rem is 0 +___ +$code.=<<___ if ($dir eq "de"); +.align 32 +.L${bits}_xts_${dir}steal: + ldx [$inp + 0], %o0 + brz,pt $ileft, 8f + ldx [$inp + 8], %o1 + + ldx [$inp + 16], %o2 + sllx %o0, $ileft, %o0 + srlx %o1, $iright, %g1 + sllx %o1, $ileft, %o1 + or %g1, %o0, %o0 + srlx %o2, $iright, %o2 + or %o2, %o1, %o1 +8: + srax %g3, 63, %l7 ! 
next tweak value + addcc %g2, %g2, %o2 + and %l7, 0x87, %l7 + addxc %g3, %g3, %o3 + xor %l7, %o2, %o2 + + movxtod %o2, %f12 + movxtod %o3, %f14 + bshuffle %f12, %f12, %f12 + bshuffle %f14, %f14, %f14 + + xor %g4, %o0, %o0 ! ^= rk[0] + xor %g5, %o1, %o1 + movxtod %o0, %f0 + movxtod %o1, %f2 + + fxor %f12, %f0, %f0 ! ^= tweak[0] + fxor %f14, %f2, %f2 + + call _${alg}${bits}_${dir}crypt_1x + add $inp, 16, $inp + + fxor %f12, %f0, %f0 ! ^= tweak[0] + fxor %f14, %f2, %f2 + + std %f0, [%fp + $::bias-16] + std %f2, [%fp + $::bias-8] + + srl $ileft, 3, $ileft + add %fp, $::bias-16, %l7 + add $inp, $ileft, $inp ! original $inp+$len&-15 + add $out, $ooff, $out ! original $out+$len&-15 + mov 0, $ileft + add $out, 16, $out + nop ! align + +.L${bits}_xts_${dir}stealing: + ldub [$inp + $ileft], %o0 + ldub [%l7 + $ileft], %o1 + dec $rem + stb %o0, [%l7 + $ileft] + stb %o1, [$out + $ileft] + brnz $rem, .L${bits}_xts_${dir}stealing + inc $ileft + + mov %l7, $inp + sub $out, 16, $out + mov 0, $ileft + sub $out, $ooff, $out + ba .L${bits}_xts_${dir}loop ! one more time + mov 1, $len ! $rem is 0 +___ +$code.=<<___; + ret + restore +.type ${alg}${bits}_t4_xts_${dir}crypt,#function +.size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt +___ +} + +# Purpose of these subroutines is to explicitly encode VIS instructions, +# so that one can compile the module without having to specify VIS +# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# Idea is to reserve for option to produce "universal" binary and let +# programmer detect if current CPU is VIS capable at run-time. +sub unvis { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my ($ref,$opf); +my %visopf = ( "faligndata" => 0x048, + "bshuffle" => 0x04c, + "fnot2" => 0x066, + "fxor" => 0x06c, + "fsrc2" => 0x078 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +sub unvis3 { +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); +my ($ref,$opf); +my %visopf = ( "addxc" => 0x011, + "addxccc" => 0x013, + "umulxhi" => 0x016, + "alignaddr" => 0x018, + "bmask" => 0x019, + "alignaddrl" => 0x01a ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if ($opf=$visopf{$mnemonic}) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%([goli])([0-9])/); + $_=$bias{$1}+$2; + } + + return sprintf ".word\t0x%08x !%s", + 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +sub unaes_round { # 4-argument instructions +my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_; +my ($ref,$opf); +my %aesopf = ( "aes_eround01" => 0, + "aes_eround23" => 1, + "aes_dround01" => 2, + "aes_dround23" => 3, + "aes_eround01_l"=> 4, + "aes_eround23_l"=> 5, + "aes_dround01_l"=> 6, + "aes_dround23_l"=> 7, + "aes_kexpand1" => 8 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd"; + + if (defined($opf=$aesopf{$mnemonic})) { + $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? 
(($1|$1>>5)&31) : $rs3; + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +sub unaes_kexpand { # 3-argument instructions +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my ($ref,$opf); +my %aesopf = ( "aes_kexpand0" => 0x130, + "aes_kexpand2" => 0x131 ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if (defined($opf=$aesopf{$mnemonic})) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +sub uncamellia_f { # 4-argument instructions +my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_; +my ($ref,$opf); + + $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd"; + + if (1) { + $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3; + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +sub uncamellia3 { # 3-argument instructions +my ($mnemonic,$rs1,$rs2,$rd)=@_; +my ($ref,$opf); +my %cmllopf = ( "camellia_fl" => 0x13c, + "camellia_fli" => 0x13d ); + + $ref = "$mnemonic\t$rs1,$rs2,$rd"; + + if (defined($opf=$cmllopf{$mnemonic})) { + foreach ($rs1,$rs2,$rd) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2, + $ref; + } else { + return $ref; + } +} + +sub unmovxtox { # 2-argument instructions +my ($mnemonic,$rs,$rd)=@_; +my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 ); +my ($ref,$opf); +my %movxopf = ( "movdtox" => 0x110, + "movstouw" => 0x111, + "movstosw" => 0x113, + "movxtod" => 0x118, + "movwtos" => 0x119 ); + + $ref = "$mnemonic\t$rs,$rd"; + + if (defined($opf=$movxopf{$mnemonic})) { + foreach ($rs,$rd) { + return $ref if (!/%([fgoli])([0-9]{1,2})/); + $_=$bias{$1}+$2; + if ($2>=32) { + return $ref if ($2&1); + # re-encode for upper double register addressing + $_=($2|$2>>5)&31; + } + } + + return sprintf ".word\t0x%08x !%s", + 2<<30|$rd<<25|0x36<<19|$opf<<5|$rs, + $ref; + } else { + return $ref; + } +} + +sub undes { +my ($mnemonic)=shift; +my @args=@_; +my ($ref,$opf); +my %desopf = ( "des_round" => 0b1001, + "des_ip" => 0b100110100, + "des_iip" => 0b100110101, + "des_kexpand" => 0b100110110 ); + + $ref = "$mnemonic\t".join(",",@_); + + if (defined($opf=$desopf{$mnemonic})) { # 4-arg + if ($mnemonic eq "des_round") { + foreach (@args[0..3]) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($1&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + return sprintf ".word\t0x%08x !%s", + 2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25, + $ref; + } elsif ($mnemonic eq "des_kexpand") { # 3-arg + foreach (@args[0..2]) { + return $ref if (!/(%f)?([0-9]{1,2})/); + $_=$2; + if ($2>=32) { + return $ref if 
($2&1); + # re-encode for upper double register addressing + $_=($2|$2>>5)&31; + } + } + return sprintf ".word\t0x%08x !%s", + 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25, + $ref; + } else { # 2-arg + foreach (@args[0..1]) { + return $ref if (!/%f([0-9]{1,2})/); + $_=$1; + if ($1>=32) { + return $ref if ($2&1); + # re-encode for upper double register addressing + $_=($1|$1>>5)&31; + } + } + return sprintf ".word\t0x%08x !%s", + 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25, + $ref; + } + } else { + return $ref; + } +} + +sub emit_assembler { + foreach (split("\n",$::code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go; + + s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/ + &unaes_round($1,$2,$3,$4,$5) + /geo or + s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/ + &unaes_kexpand($1,$2,$3,$4) + /geo or + s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/ + &uncamellia_f($1,$2,$3,$4,$5) + /geo or + s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/ + &uncamellia3($1,$2,$3,$4) + /geo or + s/\b(des_\w+)\s+(?%f[0-9]{1,2}),\s*(?[%fx0-9]+)(,\s*(?%f[0-9]{1,2})(,\s*(?%f[0-9]{1,2}))?)?/ + &undes($1,$+{rs1},$+{rs2},$+{rs3},$+{rs4}) + /geo or + s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/ + &unmovxtox($1,$2,$3) + /geo or + s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/ + &unmovxtox($1,$2,$3) + /geo or + s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/ + &unvis($1,$2,$3,$4) + /geo or + s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ + &unvis3($1,$2,$3,$4) + /geo; + + print $_,"\n"; + } +} + +1; diff -r 70e041ba5b04 -r f7ee98f5749e components/openssl/openssl-1.0.1/patches/31_dtls_version.patch --- a/components/openssl/openssl-1.0.1/patches/31_dtls_version.patch Wed Jul 17 00:17:02 2013 -0700 +++ b/components/openssl/openssl-1.0.1/patches/31_dtls_version.patch Wed Jul 17 15:19:38 2013 -0700 @@ -8,4 +8,4 @@ + if (s->version >= TLS1_1_VERSION || s->version == DTLS1_BAD_VER) { /* These lengths are all public so we can test them in - * non-constant time. + * non-constant time. 
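
The perlasm module that ends above never assumes the assembler understands the T4 AES/Camellia/DES or VIS3 mnemonics: emit_assembler() pattern-matches every generated line and the un*() helpers re-emit the instruction as a raw ".word" with the operand fields packed by hand, which is why the module builds even with a plain -xarch=v9 toolchain. The following is a minimal, self-contained Perl sketch (not part of the changeset) of that encoding for a single aes_eround01 instruction; the bit layout and the opf value 0 are copied from the unaes_round()/%aesopf code above, while encode_aes_eround01() is a name introduced here purely for illustration.

#!/usr/bin/perl
use strict;
use warnings;

# Sketch only: hand-encode one T4 AES round instruction the same way
# unaes_round() does, so the output assembles without AES-aware tooling.
# Field layout taken from unaes_round():
#   2<<30 | rd<<25 | 0x19<<19 | rs1<<14 | rs3<<9 | opf<<5 | rs2
sub encode_aes_eround01 {
    my ($rs1, $rs2, $rs3, $rd) = @_;   # %f register numbers
    my $opf = 0;                       # aes_eround01 entry in %aesopf
    for ($rs1, $rs2, $rs3, $rd) {
        # upper double registers (%f32..%f62) are re-encoded, as in the patch
        $_ = ($_ | $_ >> 5) & 31 if $_ >= 32;
    }
    return sprintf ".word\t0x%08x",
        2<<30 | $rd<<25 | 0x19<<19 | $rs1<<14 | $rs3<<9 | $opf<<5 | $rs2;
}

# aes_eround01 %f16,%f14,%f2,%f4 encodes to ".word 0x88cc040e"
print encode_aes_eround01(16, 14, 2, 4), "\n";

Following the same formula, aes_eround01 %f16,%f14,%f2,%f4 yields 0x88cc040e, which is the form the generated .s file carries in place of the mnemonic.
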
diff -r 70e041ba5b04 -r f7ee98f5749e components/openssl/openssl-1.0.1/patches/openssl-1.0.1e-t4-engine.sparc-patch --- a/components/openssl/openssl-1.0.1/patches/openssl-1.0.1e-t4-engine.sparc-patch Wed Jul 17 00:17:02 2013 -0700 +++ b/components/openssl/openssl-1.0.1/patches/openssl-1.0.1e-t4-engine.sparc-patch Wed Jul 17 15:19:38 2013 -0700 @@ -11,10 +11,10 @@ my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:"; my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; --my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void"; +-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void"; -my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void"; -+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o t4_des.o:aes_core.o aes_cbc.o aes-sparcv9.o t4_aes.o::md5-sparcv9.o t4_md5.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o t4_sha1.o t4_sha2.o:::::::ghash-sparcv9.o::void"; -+my $sparcv9_fips_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o:des_enc-sparc.o fcrypt_b.o t4_des.o:aes_core.o aes_cbc.o aes-sparcv9.o t4_aes.o::md5-sparcv9.o t4_md5.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o t4_sha1.o t4_sha2.o:::::::ghash-sparcv9.o::void"; ++my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o t4_des.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o t4_md5.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o t4_sha1.o t4_sha2.o:::::::ghash-sparcv9.o::void"; ++my $sparcv9_fips_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o:des_enc-sparc.o fcrypt_b.o t4_des.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o t4_md5.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o t4_sha1.o t4_sha2.o:::::::ghash-sparcv9.o::void"; +my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o t4_des.o:t4_aes.o::t4_md5.o:t4_sha1.o t4_sha2.o:::::::void"; my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void"; my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::"; @@ -49,9 +49,9 @@ GENERAL=Makefile #TEST=aestest.c TEST= -@@ -69,6 +73,10 @@ - aes-sparcv9.s: asm/aes-sparcv9.pl - $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@ +@@ -72,6 +76,10 @@ + aest4-sparcv9.s: asm/aest4-sparcv9.pl + $(PERL) asm/aest4-sparcv9.pl $(CFLAGS) > $@ +t4_aes.o: asm/t4_aes.S + as $(ASFLAGSYF) -o $@ asm/t4_aes.S diff -r 70e041ba5b04 -r f7ee98f5749e components/openssl/openssl-1.0.1/patches/openssl-t4-inline.sparc-patch --- a/components/openssl/openssl-1.0.1/patches/openssl-t4-inline.sparc-patch Wed Jul 17 00:17:02 2013 -0700 +++ b/components/openssl/openssl-1.0.1/patches/openssl-t4-inline.sparc-patch Wed Jul 17 
15:19:38 2013 -0700 @@ -1,5 +1,5 @@ # -# This file addds inline T4 instruction support to OpenSSL upstream code. +# This file adds inline T4 instruction support to OpenSSL upstream code. # Index: Configure =================================================================== @@ -11,7 +11,7 @@ my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:"; my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void"; -my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void"; -+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void"; ++my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void"; my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void"; my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::"; @@ -852,3 +852,664 @@ d=Time_F(STOP); print_result(D_SHA1,j,count,d); } +Index: openssl/crypto/aes/Makefile +=================================================================== +--- Makefile Thu May 2 13:42:37 2013 ++++ Makefile.orig Thu May 2 13:41:51 2013 +@@ -69,6 +69,9 @@ + aes-sparcv9.s: asm/aes-sparcv9.pl + $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@ + ++aest4-sparcv9.s: asm/aest4-sparcv9.pl ++ $(PERL) asm/aest4-sparcv9.pl $(CFLAGS) > $@ ++ + aes-ppc.s: asm/aes-ppc.pl + $(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@ + +Index: openssl/crypto/evp/e_aes.c +=================================================================== +--- e_aes.c Mon Feb 11 07:26:04 2013 ++++ e_aes.c.56 Thu May 2 14:26:35 2013 +@@ -56,13 +58,12 @@ + #include + #include + #include "evp_locl.h" +-#ifndef OPENSSL_FIPS + #include "modes_lcl.h" + #include + + typedef struct + { +- AES_KEY ks; ++ union { double align; AES_KEY ks; } ks; + block128_f block; + union { + cbc128_f cbc; +@@ -72,7 +73,7 @@ + + typedef struct + { +- AES_KEY ks; /* AES key schedule to use */ ++ union { double align; AES_KEY ks; } ks; /* AES key schedule to use */ + int key_set; /* Set if key initialised */ + int iv_set; /* Set if an iv is set */ + GCM128_CONTEXT gcm; +@@ -86,7 +87,7 @@ + + typedef struct + { +- AES_KEY ks1, ks2; /* AES key schedules to use */ ++ union { double align; AES_KEY ks; } ks1, ks2; /* AES key schedules to use */ + XTS128_CONTEXT xts; + void (*stream)(const unsigned char *in, + unsigned char *out, size_t length, +@@ -96,7 +97,7 @@ + + typedef struct + { +- AES_KEY ks; /* AES key schedule to use */ ++ union { double align; AES_KEY ks; } ks; /* AES key schedule to use */ + int key_set; /* Set if key initialised */ + int iv_set; /* Set if an iv is set */ + int 
tag_set; /* Set if tag is valid */ +@@ -160,7 +161,7 @@ + defined(_M_AMD64) || defined(_M_X64) || \ + defined(__INTEL__) ) + +-extern unsigned int OPENSSL_ia32cap_P[2]; ++extern unsigned int OPENSSL_ia32cap_P[]; + + #ifdef VPAES_ASM + #define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32))) +@@ -310,7 +311,7 @@ + return 1; + if (key) + { +- aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks); ++ aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks); + CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, + (block128_f)aesni_encrypt); + gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks; +@@ -355,19 +356,19 @@ + /* key_len is two AES keys */ + if (enc) + { +- aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); ++ aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f)aesni_encrypt; + xctx->stream = aesni_xts_encrypt; + } + else + { +- aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); ++ aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f)aesni_decrypt; + xctx->stream = aesni_xts_decrypt; + } + + aesni_set_encrypt_key(key + ctx->key_len/2, +- ctx->key_len * 4, &xctx->ks2); ++ ctx->key_len * 4, &xctx->ks2.ks); + xctx->xts.block2 = (block128_f)aesni_encrypt; + + xctx->xts.key1 = &xctx->ks1; +@@ -394,7 +395,7 @@ + return 1; + if (key) + { +- aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks); ++ aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks); + CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, + &cctx->ks, (block128_f)aesni_encrypt); + cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks : +@@ -456,6 +457,379 @@ + const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \ + { return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; } + ++#elif defined(AES_ASM) && (defined(__sparc) || defined(__sparc__)) ++ ++#include "sparc_arch.h" ++ ++extern unsigned int OPENSSL_sparcv9cap_P[]; ++ ++#define SPARC_AES_CAPABLE (OPENSSL_sparcv9cap_P[1] & CFR_AES) ++ ++void aes_t4_set_encrypt_key (const unsigned char *key, int bits, ++ AES_KEY *ks); ++void aes_t4_set_decrypt_key (const unsigned char *key, int bits, ++ AES_KEY *ks); ++void aes_t4_encrypt (const unsigned char *in, unsigned char *out, ++ const AES_KEY *key); ++void aes_t4_decrypt (const unsigned char *in, unsigned char *out, ++ const AES_KEY *key); ++/* ++ * Key-length specific subroutines were chosen for following reason. ++ * Each SPARC T4 core can execute up to 8 threads which share core's ++ * resources. Loading as much key material to registers allows to ++ * minimize references to shared memory interface, as well as amount ++ * of instructions in inner loops [much needed on T4]. But then having ++ * non-key-length specific routines would require conditional branches ++ * either in inner loops or on subroutines' entries. Former is hardly ++ * acceptable, while latter means code size increase to size occupied ++ * by multiple key-length specfic subroutines, so why fight? 
++ */ ++void aes128_t4_cbc_encrypt (const unsigned char *in, unsigned char *out, ++ size_t len, const AES_KEY *key, ++ unsigned char *ivec); ++void aes128_t4_cbc_decrypt (const unsigned char *in, unsigned char *out, ++ size_t len, const AES_KEY *key, ++ unsigned char *ivec); ++void aes192_t4_cbc_encrypt (const unsigned char *in, unsigned char *out, ++ size_t len, const AES_KEY *key, ++ unsigned char *ivec); ++void aes192_t4_cbc_decrypt (const unsigned char *in, unsigned char *out, ++ size_t len, const AES_KEY *key, ++ unsigned char *ivec); ++void aes256_t4_cbc_encrypt (const unsigned char *in, unsigned char *out, ++ size_t len, const AES_KEY *key, ++ unsigned char *ivec); ++void aes256_t4_cbc_decrypt (const unsigned char *in, unsigned char *out, ++ size_t len, const AES_KEY *key, ++ unsigned char *ivec); ++void aes128_t4_ctr32_encrypt (const unsigned char *in, unsigned char *out, ++ size_t blocks, const AES_KEY *key, ++ unsigned char *ivec); ++void aes192_t4_ctr32_encrypt (const unsigned char *in, unsigned char *out, ++ size_t blocks, const AES_KEY *key, ++ unsigned char *ivec); ++void aes256_t4_ctr32_encrypt (const unsigned char *in, unsigned char *out, ++ size_t blocks, const AES_KEY *key, ++ unsigned char *ivec); ++ ++static int aes_t4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, ++ const unsigned char *iv, int enc) ++ { ++ int ret, mode, bits; ++ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; ++ ++ mode = ctx->cipher->flags & EVP_CIPH_MODE; ++ bits = ctx->key_len*8; ++ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) ++ && !enc) ++ { ++ ret = 0; ++ aes_t4_set_decrypt_key(key, bits, ctx->cipher_data); ++ dat->block = (block128_f)aes_t4_decrypt; ++ switch (bits) { ++ case 128: ++ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? ++ (cbc128_f)aes128_t4_cbc_decrypt : ++ NULL; ++ break; ++ case 192: ++ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? ++ (cbc128_f)aes192_t4_cbc_decrypt : ++ NULL; ++ break; ++ case 256: ++ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? 
++ (cbc128_f)aes256_t4_cbc_decrypt : ++ NULL; ++ break; ++ default: ++ ret = -1; ++ } ++ } ++ else { ++ ret = 0; ++ aes_t4_set_encrypt_key(key, bits, ctx->cipher_data); ++ dat->block = (block128_f)aes_t4_encrypt; ++ switch (bits) { ++ case 128: ++ if (mode==EVP_CIPH_CBC_MODE) ++ dat->stream.cbc = (cbc128_f)aes128_t4_cbc_encrypt; ++ else if (mode==EVP_CIPH_CTR_MODE) ++ dat->stream.ctr = (ctr128_f)aes128_t4_ctr32_encrypt; ++ else ++ dat->stream.cbc = NULL; ++ break; ++ case 192: ++ if (mode==EVP_CIPH_CBC_MODE) ++ dat->stream.cbc = (cbc128_f)aes192_t4_cbc_encrypt; ++ else if (mode==EVP_CIPH_CTR_MODE) ++ dat->stream.ctr = (ctr128_f)aes192_t4_ctr32_encrypt; ++ else ++ dat->stream.cbc = NULL; ++ break; ++ case 256: ++ if (mode==EVP_CIPH_CBC_MODE) ++ dat->stream.cbc = (cbc128_f)aes256_t4_cbc_encrypt; ++ else if (mode==EVP_CIPH_CTR_MODE) ++ dat->stream.ctr = (ctr128_f)aes256_t4_ctr32_encrypt; ++ else ++ dat->stream.cbc = NULL; ++ break; ++ default: ++ ret = -1; ++ } ++ } ++ ++ if(ret < 0) ++ { ++ EVPerr(EVP_F_AES_T4_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED); ++ return 0; ++ } ++ ++ return 1; ++ } ++ ++#define aes_t4_cbc_cipher aes_cbc_cipher ++static int aes_t4_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, ++ const unsigned char *in, size_t len); ++ ++#define aes_t4_ecb_cipher aes_ecb_cipher ++static int aes_t4_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, ++ const unsigned char *in, size_t len); ++ ++#define aes_t4_ofb_cipher aes_ofb_cipher ++static int aes_t4_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, ++ const unsigned char *in,size_t len); ++ ++#define aes_t4_cfb_cipher aes_cfb_cipher ++static int aes_t4_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, ++ const unsigned char *in,size_t len); ++ ++#define aes_t4_cfb8_cipher aes_cfb8_cipher ++static int aes_t4_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, ++ const unsigned char *in,size_t len); ++ ++#define aes_t4_cfb1_cipher aes_cfb1_cipher ++static int aes_t4_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, ++ const unsigned char *in,size_t len); ++ ++#define aes_t4_ctr_cipher aes_ctr_cipher ++static int aes_t4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len); ++ ++static int aes_t4_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, ++ const unsigned char *iv, int enc) ++ { ++ EVP_AES_GCM_CTX *gctx = ctx->cipher_data; ++ if (!iv && !key) ++ return 1; ++ if (key) ++ { ++ int bits = ctx->key_len * 8; ++ aes_t4_set_encrypt_key(key, bits, &gctx->ks.ks); ++ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, ++ (block128_f)aes_t4_encrypt); ++ switch (bits) { ++ case 128: ++ gctx->ctr = (ctr128_f)aes128_t4_ctr32_encrypt; ++ break; ++ case 192: ++ gctx->ctr = (ctr128_f)aes192_t4_ctr32_encrypt; ++ break; ++ case 256: ++ gctx->ctr = (ctr128_f)aes256_t4_ctr32_encrypt; ++ break; ++ default: ++ return 0; ++ } ++ /* If we have an iv can set it directly, otherwise use ++ * saved IV. 
++ */ ++ if (iv == NULL && gctx->iv_set) ++ iv = gctx->iv; ++ if (iv) ++ { ++ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen); ++ gctx->iv_set = 1; ++ } ++ gctx->key_set = 1; ++ } ++ else ++ { ++ /* If key set use IV, otherwise copy */ ++ if (gctx->key_set) ++ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen); ++ else ++ memcpy(gctx->iv, iv, gctx->ivlen); ++ gctx->iv_set = 1; ++ gctx->iv_gen = 0; ++ } ++ return 1; ++ } ++ ++#define aes_t4_gcm_cipher aes_gcm_cipher ++static int aes_t4_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len); ++ ++static int aes_t4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, ++ const unsigned char *iv, int enc) ++ { ++ EVP_AES_XTS_CTX *xctx = ctx->cipher_data; ++ if (!iv && !key) ++ return 1; ++ ++ if (key) ++ { ++ int bits = ctx->key_len * 4; ++ /* key_len is two AES keys */ ++ if (enc) ++ { ++ aes_t4_set_encrypt_key(key, bits, &xctx->ks1.ks); ++ xctx->xts.block1 = (block128_f)aes_t4_encrypt; ++#if 0 /* not yet */ ++ switch (bits) { ++ case 128: ++ xctx->stream = aes128_t4_xts_encrypt; ++ break; ++ case 192: ++ xctx->stream = aes192_t4_xts_encrypt; ++ break; ++ case 256: ++ xctx->stream = aes256_t4_xts_encrypt; ++ break; ++ default: ++ return 0; ++ } ++#endif ++ } ++ else ++ { ++ aes_t4_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); ++ xctx->xts.block1 = (block128_f)aes_t4_decrypt; ++#if 0 /* not yet */ ++ switch (bits) { ++ case 128: ++ xctx->stream = aes128_t4_xts_decrypt; ++ break; ++ case 192: ++ xctx->stream = aes192_t4_xts_decrypt; ++ break; ++ case 256: ++ xctx->stream = aes256_t4_xts_decrypt; ++ break; ++ default: ++ return 0; ++ } ++#endif ++ } ++ ++ aes_t4_set_encrypt_key(key + ctx->key_len/2, ++ ctx->key_len * 4, &xctx->ks2.ks); ++ xctx->xts.block2 = (block128_f)aes_t4_encrypt; ++ ++ xctx->xts.key1 = &xctx->ks1; ++ } ++ ++ if (iv) ++ { ++ xctx->xts.key2 = &xctx->ks2; ++ memcpy(ctx->iv, iv, 16); ++ } ++ ++ return 1; ++ } ++ ++#define aes_t4_xts_cipher aes_xts_cipher ++static int aes_t4_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len); ++ ++static int aes_t4_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, ++ const unsigned char *iv, int enc) ++ { ++ EVP_AES_CCM_CTX *cctx = ctx->cipher_data; ++ if (!iv && !key) ++ return 1; ++ if (key) ++ { ++ int bits = ctx->key_len * 8; ++ aes_t4_set_encrypt_key(key, bits, &cctx->ks.ks); ++ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, ++ &cctx->ks, (block128_f)aes_t4_encrypt); ++#if 0 /* not yet */ ++ switch (bits) { ++ case 128: ++ cctx->str = enc?(ccm128_f)aes128_t4_ccm64_encrypt : ++ (ccm128_f)ae128_t4_ccm64_decrypt; ++ break; ++ case 192: ++ cctx->str = enc?(ccm128_f)aes192_t4_ccm64_encrypt : ++ (ccm128_f)ae192_t4_ccm64_decrypt; ++ break; ++ case 256: ++ cctx->str = enc?(ccm128_f)aes256_t4_ccm64_encrypt : ++ (ccm128_f)ae256_t4_ccm64_decrypt; ++ break; ++ default: ++ return 0; ++ } ++#endif ++ cctx->key_set = 1; ++ } ++ if (iv) ++ { ++ memcpy(ctx->iv, iv, 15 - cctx->L); ++ cctx->iv_set = 1; ++ } ++ return 1; ++ } ++ ++#define aes_t4_ccm_cipher aes_ccm_cipher ++static int aes_t4_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len); ++ ++#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \ ++static const EVP_CIPHER aes_t4_##keylen##_##mode = { \ ++ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \ ++ flags|EVP_CIPH_##MODE##_MODE, \ ++ aes_t4_init_key, \ ++ aes_t4_##mode##_cipher, \ ++ NULL, \ ++ sizeof(EVP_AES_KEY), \ ++ 
NULL,NULL,NULL,NULL }; \ ++static const EVP_CIPHER aes_##keylen##_##mode = { \ ++ nid##_##keylen##_##nmode,blocksize, \ ++ keylen/8,ivlen, \ ++ flags|EVP_CIPH_##MODE##_MODE, \ ++ aes_init_key, \ ++ aes_##mode##_cipher, \ ++ NULL, \ ++ sizeof(EVP_AES_KEY), \ ++ NULL,NULL,NULL,NULL }; \ ++const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \ ++{ return SPARC_AES_CAPABLE?&aes_t4_##keylen##_##mode:&aes_##keylen##_##mode; } ++ ++#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \ ++static const EVP_CIPHER aes_t4_##keylen##_##mode = { \ ++ nid##_##keylen##_##mode,blocksize, \ ++ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \ ++ flags|EVP_CIPH_##MODE##_MODE, \ ++ aes_t4_##mode##_init_key, \ ++ aes_t4_##mode##_cipher, \ ++ aes_##mode##_cleanup, \ ++ sizeof(EVP_AES_##MODE##_CTX), \ ++ NULL,NULL,aes_##mode##_ctrl,NULL }; \ ++static const EVP_CIPHER aes_##keylen##_##mode = { \ ++ nid##_##keylen##_##mode,blocksize, \ ++ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \ ++ flags|EVP_CIPH_##MODE##_MODE, \ ++ aes_##mode##_init_key, \ ++ aes_##mode##_cipher, \ ++ aes_##mode##_cleanup, \ ++ sizeof(EVP_AES_##MODE##_CTX), \ ++ NULL,NULL,aes_##mode##_ctrl,NULL }; \ ++const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \ ++{ return SPARC_AES_CAPABLE?&aes_t4_##keylen##_##mode:&aes_##keylen##_##mode; } ++ + #else + + #define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \ +@@ -505,7 +879,7 @@ + #ifdef BSAES_CAPABLE + if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE) + { +- ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks); ++ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks); + dat->block = (block128_f)AES_decrypt; + dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt; + } +@@ -514,7 +888,7 @@ + #ifdef VPAES_CAPABLE + if (VPAES_CAPABLE) + { +- ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks); ++ ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks); + dat->block = (block128_f)vpaes_decrypt; + dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? + (cbc128_f)vpaes_cbc_encrypt : +@@ -523,7 +897,7 @@ + else + #endif + { +- ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks); ++ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks); + dat->block = (block128_f)AES_decrypt; + dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? + (cbc128_f)AES_cbc_encrypt : +@@ -533,7 +907,7 @@ + #ifdef BSAES_CAPABLE + if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE) + { +- ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks); ++ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks); + dat->block = (block128_f)AES_encrypt; + dat->stream.ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks; + } +@@ -542,7 +916,7 @@ + #ifdef VPAES_CAPABLE + if (VPAES_CAPABLE) + { +- ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks); ++ ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks); + dat->block = (block128_f)vpaes_encrypt; + dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? + (cbc128_f)vpaes_cbc_encrypt : +@@ -551,7 +925,7 @@ + else + #endif + { +- ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks); ++ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks); + dat->block = (block128_f)AES_encrypt; + dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? 
+ (cbc128_f)AES_cbc_encrypt : +@@ -825,7 +1199,7 @@ + #ifdef BSAES_CAPABLE + if (BSAES_CAPABLE) + { +- AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks); ++ AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks); + CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks, + (block128_f)AES_encrypt); + gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks; +@@ -836,7 +1210,7 @@ + #ifdef VPAES_CAPABLE + if (VPAES_CAPABLE) + { +- vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks); ++ vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks); + CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks, + (block128_f)vpaes_encrypt); + gctx->ctr = NULL; +@@ -843,7 +1217,7 @@ + break; + } + #endif +- AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks); ++ AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks); + CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt); + #ifdef AES_CTR_ASM + gctx->ctr = (ctr128_f)AES_ctr32_encrypt; +@@ -1074,17 +1448,17 @@ + { + if (enc) + { +- vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); ++ vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f)vpaes_encrypt; + } + else + { +- vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); ++ vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f)vpaes_decrypt; + } + + vpaes_set_encrypt_key(key + ctx->key_len/2, +- ctx->key_len * 4, &xctx->ks2); ++ ctx->key_len * 4, &xctx->ks2.ks); + xctx->xts.block2 = (block128_f)vpaes_encrypt; + + xctx->xts.key1 = &xctx->ks1; +@@ -1093,17 +1467,17 @@ + #endif + if (enc) + { +- AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); ++ AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f)AES_encrypt; + } + else + { +- AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); ++ AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f)AES_decrypt; + } + + AES_set_encrypt_key(key + ctx->key_len/2, +- ctx->key_len * 4, &xctx->ks2); ++ ctx->key_len * 4, &xctx->ks2.ks); + xctx->xts.block2 = (block128_f)AES_encrypt; + + xctx->xts.key1 = &xctx->ks1; +@@ -1214,7 +1588,7 @@ + #ifdef VPAES_CAPABLE + if (VPAES_CAPABLE) + { +- vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks); ++ vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks.ks); + CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, + &cctx->ks, (block128_f)vpaes_encrypt); + cctx->str = NULL; +@@ -1222,7 +1596,7 @@ + break; + } + #endif +- AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks); ++ AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks); + CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, + &cctx->ks, (block128_f)AES_encrypt); + cctx->str = NULL; +@@ -1310,5 +1684,4 @@ + BLOCK_CIPHER_custom(NID_aes,192,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS) + BLOCK_CIPHER_custom(NID_aes,256,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS) + +-#endif + #endif +Index: openssl/crypto/evp/evp.h +=================================================================== +--- evp.h Mon Feb 11 07:26:04 2013 ++++ evp.h.new Thu May 2 14:31:55 2013 +@@ -1256,6 +1256,7 @@ + #define EVP_F_AESNI_INIT_KEY 165 + #define EVP_F_AESNI_XTS_CIPHER 176 + #define EVP_F_AES_INIT_KEY 133 ++#define EVP_F_AES_T4_INIT_KEY 178 + #define EVP_F_AES_XTS 172 + #define EVP_F_AES_XTS_CIPHER 175 + #define EVP_F_ALG_MODULE_INIT 177 +Index: openssl/crypto/evp/evp_err.c +=================================================================== +--- evp_err.c Mon Feb 11 07:26:04 2013 ++++ evp_err.c.new Thu May 2 14:33:24 2013 +@@ -73,6 +73,7 @@ + 
{ERR_FUNC(EVP_F_AESNI_INIT_KEY), "AESNI_INIT_KEY"}, + {ERR_FUNC(EVP_F_AESNI_XTS_CIPHER), "AESNI_XTS_CIPHER"}, + {ERR_FUNC(EVP_F_AES_INIT_KEY), "AES_INIT_KEY"}, ++{ERR_FUNC(EVP_F_AES_T4_INIT_KEY), "AES_T4_INIT_KEY"}, + {ERR_FUNC(EVP_F_AES_XTS), "AES_XTS"}, + {ERR_FUNC(EVP_F_AES_XTS_CIPHER), "AES_XTS_CIPHER"}, + {ERR_FUNC(EVP_F_ALG_MODULE_INIT), "ALG_MODULE_INIT"},