/*
 * ====================================================================
 * Copyright (c) 1998-2011 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 * openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

/*
 * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
 */

/*LINTLIBRARY*/

#if defined(lint) || defined(__lint)


#include <sys/types.h>

/*ARGSUSED*/
void t4_aes_expand128(uint64_t *rk, const uint32_t *key)
{ return; }

/*ARGSUSED*/
void t4_aes_expand192(uint64_t *rk, const uint32_t *key)
{ return; }

/*ARGSUSED*/
void t4_aes_expand256(uint64_t *rk, const uint32_t *key)
{ return; }

/*ARGSUSED*/
void t4_aes128_load_keys_for_encrypt(uint64_t *ks)
{ return; }

/*ARGSUSED*/
void t4_aes192_load_keys_for_encrypt(uint64_t *ks)
{ return; }

/*ARGSUSED*/
void t4_aes256_load_keys_for_encrypt(uint64_t *ks)
{ return; }

/*ARGSUSED*/
void t4_aes128_ecb_encrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes192_ecb_encrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes256_ecb_encrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes128_cbc_encrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes192_cbc_encrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes256_cbc_encrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes128_ctr_crypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes192_ctr_crypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes256_ctr_crypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes128_cfb128_encrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes192_cfb128_encrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes256_cfb128_encrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes128_load_keys_for_decrypt(uint64_t *ks)
{ return; }

/*ARGSUSED*/
void t4_aes192_load_keys_for_decrypt(uint64_t *ks)
{ return; }

/*ARGSUSED*/
void t4_aes256_load_keys_for_decrypt(uint64_t *ks)
{ return; }

/*ARGSUSED*/
void t4_aes128_ecb_decrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_decrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes192_ecb_decrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_decrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes256_ecb_decrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_decrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes128_cbc_decrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_decrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes192_cbc_decrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_decrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes256_cbc_decrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_decrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes128_cfb128_decrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_decrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes192_cfb128_decrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_decrypt, uint64_t *iv)
{ return; }

/*ARGSUSED*/
void t4_aes256_cfb128_decrypt(uint64_t *ks, uint64_t *asm_in,
    uint64_t * asm_out, size_t amount_to_decrypt, uint64_t *iv)
{ return; }
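
/*
 * Typical call sequence from C -- a minimal illustrative sketch only
 * (the exact key-schedule layout is arranged by the caller, and the
 * iv argument is unused for ECB):
 *
 *	t4_aes_expand128(ks, key);
 *	t4_aes128_load_keys_for_encrypt(ks);
 *	t4_aes128_ecb_encrypt(ks, in, out, nbytes, NULL);
 *
 * The load/crypt split assumes nothing clobbers the FP key registers
 * between the two calls, and the crypt routines also read ks[0] and
 * ks[1] (the first round key) directly with integer loads.
 */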

#else	/* lint || __lint */

#include <sys/asm_linkage.h>


	ENTRY(t4_aes_expand128)
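
! Key expansion convention (all three t4_aes_expandNNN routines):
! %o0 points at the round-key output array, %o1 at the user key.
! The schedule is built up in FP registers and stored back to memory
! with std.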

!load key
	ld	[%o1], %f0
	ld	[%o1 + 0x4], %f1
	ld	[%o1 + 0x8], %f2
	ld	[%o1 + 0xc], %f3

!expand the key
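! The commented-out aes_kexpand1/aes_kexpand2 mnemonics below document
! the intended instructions; the .byte groups that follow them are the
! corresponding hand-assembled opcodes, presumably emitted this way
! for assemblers that do not accept the T4 crypto mnemonics.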
	!aes_kexpand1 %f0, %f2, 0x0, %f4
	!aes_kexpand2 %f2, %f4, %f6
	!aes_kexpand1 %f4, %f6, 0x1, %f8
	!aes_kexpand2 %f6, %f8, %f10
	!aes_kexpand1 %f8, %f10, 0x2, %f12
	!aes_kexpand2 %f10, %f12, %f14
	!aes_kexpand1 %f12, %f14, 0x3, %f16
	!aes_kexpand2 %f14, %f16, %f18
	!aes_kexpand1 %f16, %f18, 0x4, %f20
	!aes_kexpand2 %f18, %f20, %f22
	!aes_kexpand1 %f20, %f22, 0x5, %f24
	!aes_kexpand2 %f22, %f24, %f26
	!aes_kexpand1 %f24, %f26, 0x6, %f28
	!aes_kexpand2 %f26, %f28, %f30
	!aes_kexpand1 %f28, %f30, 0x7, %f32
	!aes_kexpand2 %f30, %f32, %f34
	!aes_kexpand1 %f32, %f34, 0x8, %f36
	!aes_kexpand2 %f34, %f36, %f38
	!aes_kexpand1 %f36, %f38, 0x9, %f40
	!aes_kexpand2 %f38, %f40, %f42
	.byte	0x88, 0xc8, 0x01, 0x02
	.byte	0x8d, 0xb0, 0xa6, 0x24
	.byte	0x90, 0xc9, 0x03, 0x06
	.byte	0x95, 0xb1, 0xa6, 0x28
	.byte	0x98, 0xca, 0x05, 0x0a
	.byte	0x9d, 0xb2, 0xa6, 0x2c
	.byte	0xa0, 0xcb, 0x07, 0x0e
	.byte	0xa5, 0xb3, 0xa6, 0x30
	.byte	0xa8, 0xcc, 0x09, 0x12
	.byte	0xad, 0xb4, 0xa6, 0x34
	.byte	0xb0, 0xcd, 0x0b, 0x16
	.byte	0xb5, 0xb5, 0xa6, 0x38
	.byte	0xb8, 0xce, 0x0d, 0x1a
	.byte	0xbd, 0xb6, 0xa6, 0x3c
	.byte	0x82, 0xcf, 0x0f, 0x1e
	.byte	0x87, 0xb7, 0xa6, 0x21
	.byte	0x8a, 0xc8, 0x51, 0x03
	.byte	0x8f, 0xb0, 0xe6, 0x25
	.byte	0x92, 0xc9, 0x53, 0x07
	.byte	0x97, 0xb1, 0xe6, 0x29

!copy expanded key back into array
	std	%f4, [%o0]
	std	%f6, [%o0 + 0x8]
	std	%f8, [%o0 + 0x10]
	std	%f10, [%o0 + 0x18]
	std	%f12, [%o0 + 0x20]
	std	%f14, [%o0 + 0x28]
	std	%f16, [%o0 + 0x30]
	std	%f18, [%o0 + 0x38]
	std	%f20, [%o0 + 0x40]
	std	%f22, [%o0 + 0x48]
	std	%f24, [%o0 + 0x50]
	std	%f26, [%o0 + 0x58]
	std	%f28, [%o0 + 0x60]
	std	%f30, [%o0 + 0x68]
	std	%f32, [%o0 + 0x70]
	std	%f34, [%o0 + 0x78]
	std	%f36, [%o0 + 0x80]
	std	%f38, [%o0 + 0x88]
	std	%f40, [%o0 + 0x90]
	retl
	std	%f42, [%o0 + 0x98]

	SET_SIZE(t4_aes_expand128)


	ENTRY(t4_aes_expand192)

!load key
	ld	[%o1], %f0
	ld	[%o1 + 0x4], %f1
	ld	[%o1 + 0x8], %f2
	ld	[%o1 + 0xc], %f3
	ld	[%o1 + 0x10], %f4
	ld	[%o1 + 0x14], %f5

!expand the key
	!aes_kexpand1 %f0, %f4, 0x0, %f6
	!aes_kexpand2 %f2, %f6, %f8
	!aes_kexpand2 %f4, %f8, %f10

	!aes_kexpand1 %f6, %f10, 0x1, %f12
	!aes_kexpand2 %f8, %f12, %f14
	!aes_kexpand2 %f10, %f14, %f16

	!aes_kexpand1 %f12, %f16, 0x2, %f18
	!aes_kexpand2 %f14, %f18, %f20
	!aes_kexpand2 %f16, %f20, %f22

	!aes_kexpand1 %f18, %f22, 0x3, %f24
	!aes_kexpand2 %f20, %f24, %f26
	!aes_kexpand2 %f22, %f26, %f28

	!aes_kexpand1 %f24, %f28, 0x4, %f30
	!aes_kexpand2 %f26, %f30, %f32
	!aes_kexpand2 %f28, %f32, %f34

	!aes_kexpand1 %f30, %f34, 0x5, %f36
	!aes_kexpand2 %f32, %f36, %f38
	!aes_kexpand2 %f34, %f38, %f40

	!aes_kexpand1 %f36, %f40, 0x6, %f42
	!aes_kexpand2 %f38, %f42, %f44
	!aes_kexpand2 %f40, %f44, %f46

	!aes_kexpand1 %f42, %f46, 0x7, %f48
	!aes_kexpand2 %f44, %f48, %f50
	.byte	0x8c, 0xc8, 0x01, 0x04
	.byte	0x91, 0xb0, 0xa6, 0x26
	.byte	0x95, 0xb1, 0x26, 0x28
	.byte	0x98, 0xc9, 0x83, 0x0a
	.byte	0x9d, 0xb2, 0x26, 0x2c
	.byte	0xa1, 0xb2, 0xa6, 0x2e
	.byte	0xa4, 0xcb, 0x05, 0x10
	.byte	0xa9, 0xb3, 0xa6, 0x32
	.byte	0xad, 0xb4, 0x26, 0x34
	.byte	0xb0, 0xcc, 0x87, 0x16
	.byte	0xb5, 0xb5, 0x26, 0x38
	.byte	0xb9, 0xb5, 0xa6, 0x3a
	.byte	0xbc, 0xce, 0x09, 0x1c
	.byte	0x83, 0xb6, 0xa6, 0x3e
	.byte	0x87, 0xb7, 0x26, 0x21
	.byte	0x8a, 0xcf, 0x8b, 0x03
	.byte	0x8f, 0xb0, 0x66, 0x25
	.byte	0x93, 0xb0, 0xe6, 0x27
	.byte	0x96, 0xc9, 0x4d, 0x09
	.byte	0x9b, 0xb1, 0xe6, 0x2b
	.byte	0x9f, 0xb2, 0x66, 0x2d
	.byte	0xa2, 0xca, 0xcf, 0x0f
	.byte	0xa7, 0xb3, 0x66, 0x31

!copy expanded key back into array
	std	%f6, [%o0]
	std	%f8, [%o0 + 0x8]
	std	%f10, [%o0 + 0x10]
	std	%f12, [%o0 + 0x18]
	std	%f14, [%o0 + 0x20]
	std	%f16, [%o0 + 0x28]
	std	%f18, [%o0 + 0x30]
	std	%f20, [%o0 + 0x38]
	std	%f22, [%o0 + 0x40]
	std	%f24, [%o0 + 0x48]
	std	%f26, [%o0 + 0x50]
	std	%f28, [%o0 + 0x58]
	std	%f30, [%o0 + 0x60]
	std	%f32, [%o0 + 0x68]
	std	%f34, [%o0 + 0x70]
	std	%f36, [%o0 + 0x78]
	std	%f38, [%o0 + 0x80]
	std	%f40, [%o0 + 0x88]
	std	%f42, [%o0 + 0x90]
	std	%f44, [%o0 + 0x98]
	std	%f46, [%o0 + 0xa0]
	std	%f48, [%o0 + 0xa8]
	retl
	std	%f50, [%o0 + 0xb0]

	SET_SIZE(t4_aes_expand192)


	ENTRY(t4_aes_expand256)

!load key
	ld	[%o1], %f0
	ld	[%o1 + 0x4], %f1
	ld	[%o1 + 0x8], %f2
	ld	[%o1 + 0xc], %f3
	ld	[%o1 + 0x10], %f4
	ld	[%o1 + 0x14], %f5
	ld	[%o1 + 0x18], %f6
	ld	[%o1 + 0x1c], %f7

!expand the key
	!aes_kexpand1 %f0, %f6, 0x0, %f8
	!aes_kexpand2 %f2, %f8, %f10
	!aes_kexpand0 %f4, %f10, %f12
	!aes_kexpand2 %f6, %f12, %f14

	!aes_kexpand1 %f8, %f14, 0x1, %f16
	!aes_kexpand2 %f10, %f16, %f18
	!aes_kexpand0 %f12, %f18, %f20
	!aes_kexpand2 %f14, %f20, %f22

	!aes_kexpand1 %f16, %f22, 0x2, %f24
	!aes_kexpand2 %f18, %f24, %f26
	!aes_kexpand0 %f20, %f26, %f28
	!aes_kexpand2 %f22, %f28, %f30

	!aes_kexpand1 %f24, %f30, 0x3, %f32
	!aes_kexpand2 %f26, %f32, %f34
	!aes_kexpand0 %f28, %f34, %f36
	!aes_kexpand2 %f30, %f36, %f38

	!aes_kexpand1 %f32, %f38, 0x4, %f40
	!aes_kexpand2 %f34, %f40, %f42
	!aes_kexpand0 %f36, %f42, %f44
	!aes_kexpand2 %f38, %f44, %f46

	!aes_kexpand1 %f40, %f46, 0x5, %f48
	!aes_kexpand2 %f42, %f48, %f50
	!aes_kexpand0 %f44, %f50, %f52
	!aes_kexpand2 %f46, %f52, %f54

	!aes_kexpand1 %f48, %f54, 0x6, %f56
	!aes_kexpand2 %f50, %f56, %f58
	.byte	0x90, 0xc8, 0x01, 0x06
	.byte	0x95, 0xb0, 0xa6, 0x28
	.byte	0x99, 0xb1, 0x26, 0x0a
	.byte	0x9d, 0xb1, 0xa6, 0x2c
	.byte	0xa0, 0xca, 0x03, 0x0e
	.byte	0xa5, 0xb2, 0xa6, 0x30
	.byte	0xa9, 0xb3, 0x26, 0x12
	.byte	0xad, 0xb3, 0xa6, 0x34
	.byte	0xb0, 0xcc, 0x05, 0x16
	.byte	0xb5, 0xb4, 0xa6, 0x38
	.byte	0xb9, 0xb5, 0x26, 0x1a
	.byte	0xbd, 0xb5, 0xa6, 0x3c
	.byte	0x82, 0xce, 0x07, 0x1e
	.byte	0x87, 0xb6, 0xa6, 0x21
	.byte	0x8b, 0xb7, 0x26, 0x03
	.byte	0x8f, 0xb7, 0xa6, 0x25
	.byte	0x92, 0xc8, 0x49, 0x07
	.byte	0x97, 0xb0, 0xe6, 0x29
	.byte	0x9b, 0xb1, 0x66, 0x0b
	.byte	0x9f, 0xb1, 0xe6, 0x2d
	.byte	0xa2, 0xca, 0x4b, 0x0f
	.byte	0xa7, 0xb2, 0xe6, 0x31
	.byte	0xab, 0xb3, 0x66, 0x13
	.byte	0xaf, 0xb3, 0xe6, 0x35
	.byte	0xb2, 0xcc, 0x4d, 0x17
	.byte	0xb7, 0xb4, 0xe6, 0x39

!copy expanded key back into array
	std	%f8, [%o0]
	std	%f10, [%o0 + 0x8]
	std	%f12, [%o0 + 0x10]
	std	%f14, [%o0 + 0x18]
	std	%f16, [%o0 + 0x20]
	std	%f18, [%o0 + 0x28]
	std	%f20, [%o0 + 0x30]
	std	%f22, [%o0 + 0x38]
	std	%f24, [%o0 + 0x40]
	std	%f26, [%o0 + 0x48]
	std	%f28, [%o0 + 0x50]
	std	%f30, [%o0 + 0x58]
	std	%f32, [%o0 + 0x60]
	std	%f34, [%o0 + 0x68]
	std	%f36, [%o0 + 0x70]
	std	%f38, [%o0 + 0x78]
	std	%f40, [%o0 + 0x80]
	std	%f42, [%o0 + 0x88]
	std	%f44, [%o0 + 0x90]
	std	%f46, [%o0 + 0x98]
	std	%f48, [%o0 + 0xa0]
	std	%f50, [%o0 + 0xa8]
	std	%f52, [%o0 + 0xb0]
	std	%f54, [%o0 + 0xb8]
	std	%f56, [%o0 + 0xc0]
	retl
	std	%f58, [%o0 + 0xc8]

	SET_SIZE(t4_aes_expand256)
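
! Round macros.  The plain variants run a single block (in %f60/%f62)
! through the AES rounds; the _2 variants interleave a second,
! independent block (in %f0-%f6, or %f20/%f22 in the 256-bit cases)
! so that the two dataflows can overlap in the pipeline.  As with the
! expansion code, each macro's .byte opcodes are documented by the
! commented-out mnemonics that follow it.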


#define	FIRST_TWO_EROUNDS \
	.byte	0xb2, 0xc8, 0x3e, 0x1d ; \
	.byte	0xb6, 0xc8, 0xbe, 0x3d ; \
	.byte	0xba, 0xc9, 0x36, 0x19 ; \
	.byte	0xbe, 0xc9, 0xb6, 0x39
	!aes_eround01	%f0, %f60, %f62, %f56 ; \
	!aes_eround23	%f2, %f60, %f62, %f58 ; \
	!aes_eround01	%f4, %f56, %f58, %f60 ; \
	!aes_eround23	%f6, %f56, %f58, %f62

#define	MID_TWO_EROUNDS \
	.byte	0xb2, 0xca, 0x3e, 0x1d ; \
	.byte	0xb6, 0xca, 0xbe, 0x3d ; \
	.byte	0xba, 0xcb, 0x36, 0x19 ; \
	.byte	0xbe, 0xcb, 0xb6, 0x39
	!aes_eround01	%f8, %f60, %f62, %f56 ; \
	!aes_eround23	%f10, %f60, %f62, %f58 ; \
	!aes_eround01	%f12, %f56, %f58, %f60 ; \
	!aes_eround23	%f14, %f56, %f58, %f62

#define	MID_TWO_EROUNDS_2 \
	.byte	0x8c, 0xca, 0x04, 0x00 ; \
	.byte	0x88, 0xca, 0x84, 0x20 ; \
	.byte	0xb2, 0xca, 0x3e, 0x1d ; \
	.byte	0xb6, 0xca, 0xbe, 0x3d ; \
	.byte	0x80, 0xcb, 0x08, 0x06 ; \
	.byte	0x84, 0xcb, 0x88, 0x26 ; \
	.byte	0xba, 0xcb, 0x36, 0x19 ; \
	.byte	0xbe, 0xcb, 0xb6, 0x39
	!aes_eround01	%f8, %f0, %f2, %f6 ; \
	!aes_eround23	%f10, %f0, %f2, %f4 ; \
	!aes_eround01	%f8, %f60, %f62, %f56 ; \
	!aes_eround23	%f10, %f60, %f62, %f58 ; \
	!aes_eround01	%f12, %f6, %f4, %f0 ; \
	!aes_eround23	%f14, %f6, %f4, %f2 ; \
	!aes_eround01	%f12, %f56, %f58, %f60 ; \
	!aes_eround23	%f14, %f56, %f58, %f62

#define	TEN_EROUNDS \
	.byte	0xb2, 0xcc, 0x3e, 0x1d ; \
	.byte	0xb6, 0xcc, 0xbe, 0x3d ; \
	.byte	0xba, 0xcd, 0x36, 0x19 ; \
	.byte	0xbe, 0xcd, 0xb6, 0x39 ; \
	.byte	0xb2, 0xce, 0x3e, 0x1d ; \
	.byte	0xb6, 0xce, 0xbe, 0x3d ; \
	.byte	0xba, 0xcf, 0x36, 0x19 ; \
	.byte	0xbe, 0xcf, 0xb6, 0x39 ; \
	.byte	0xb2, 0xc8, 0x7e, 0x1d ; \
	.byte	0xb6, 0xc8, 0xfe, 0x3d ; \
	.byte	0xba, 0xc9, 0x76, 0x19 ; \
	.byte	0xbe, 0xc9, 0xf6, 0x39 ; \
	.byte	0xb2, 0xca, 0x7e, 0x1d ; \
	.byte	0xb6, 0xca, 0xfe, 0x3d ; \
	.byte	0xba, 0xcb, 0x76, 0x19 ; \
	.byte	0xbe, 0xcb, 0xf6, 0x39 ; \
	.byte	0xb2, 0xcc, 0x7e, 0x1d ; \
	.byte	0xb6, 0xcc, 0xfe, 0x3d ; \
	.byte	0xba, 0xcd, 0x76, 0x99 ; \
	.byte	0xbe, 0xcd, 0xf6, 0xb9
	!aes_eround01	%f16, %f60, %f62, %f56 ; \
	!aes_eround23	%f18, %f60, %f62, %f58 ; \
	!aes_eround01	%f20, %f56, %f58, %f60 ; \
	!aes_eround23	%f22, %f56, %f58, %f62 ; \
	!aes_eround01	%f24, %f60, %f62, %f56 ; \
	!aes_eround23	%f26, %f60, %f62, %f58 ; \
	!aes_eround01	%f28, %f56, %f58, %f60 ; \
	!aes_eround23	%f30, %f56, %f58, %f62 ; \
	!aes_eround01	%f32, %f60, %f62, %f56 ; \
	!aes_eround23	%f34, %f60, %f62, %f58 ; \
	!aes_eround01	%f36, %f56, %f58, %f60 ; \
	!aes_eround23	%f38, %f56, %f58, %f62 ; \
	!aes_eround01	%f40, %f60, %f62, %f56 ; \
	!aes_eround23	%f42, %f60, %f62, %f58 ; \
	!aes_eround01	%f44, %f56, %f58, %f60 ; \
	!aes_eround23	%f46, %f56, %f58, %f62 ; \
	!aes_eround01	%f48, %f60, %f62, %f56 ; \
	!aes_eround23	%f50, %f60, %f62, %f58 ; \
	!aes_eround01_l	%f52, %f56, %f58, %f60 ; \
	!aes_eround23_l	%f54, %f56, %f58, %f62

#define	TEN_EROUNDS_2 \
	.byte	0x8c, 0xcc, 0x04, 0x00 ; \
	.byte	0x88, 0xcc, 0x84, 0x20 ; \
	.byte	0xb2, 0xcc, 0x3e, 0x1d ; \
	.byte	0xb6, 0xcc, 0xbe, 0x3d ; \
	.byte	0x80, 0xcd, 0x08, 0x06 ; \
	.byte	0x84, 0xcd, 0x88, 0x26 ; \
	.byte	0xba, 0xcd, 0x36, 0x19 ; \
	.byte	0xbe, 0xcd, 0xb6, 0x39 ; \
	.byte	0x8c, 0xce, 0x04, 0x00 ; \
	.byte	0x88, 0xce, 0x84, 0x20 ; \
	.byte	0xb2, 0xce, 0x3e, 0x1d ; \
	.byte	0xb6, 0xce, 0xbe, 0x3d ; \
	.byte	0x80, 0xcf, 0x08, 0x06 ; \
	.byte	0x84, 0xcf, 0x88, 0x26 ; \
	.byte	0xba, 0xcf, 0x36, 0x19 ; \
	.byte	0xbe, 0xcf, 0xb6, 0x39 ; \
	.byte	0x8c, 0xc8, 0x44, 0x00 ; \
	.byte	0x88, 0xc8, 0xc4, 0x20 ; \
	.byte	0xb2, 0xc8, 0x7e, 0x1d ; \
	.byte	0xb6, 0xc8, 0xfe, 0x3d ; \
	.byte	0x80, 0xc9, 0x48, 0x06 ; \
	.byte	0x84, 0xc9, 0xc8, 0x26 ; \
	.byte	0xba, 0xc9, 0x76, 0x19 ; \
	.byte	0xbe, 0xc9, 0xf6, 0x39 ; \
	.byte	0x8c, 0xca, 0x44, 0x00 ; \
	.byte	0x88, 0xca, 0xc4, 0x20 ; \
	.byte	0xb2, 0xca, 0x7e, 0x1d ; \
	.byte	0xb6, 0xca, 0xfe, 0x3d ; \
	.byte	0x80, 0xcb, 0x48, 0x06 ; \
	.byte	0x84, 0xcb, 0xc8, 0x26 ; \
	.byte	0xba, 0xcb, 0x76, 0x19 ; \
	.byte	0xbe, 0xcb, 0xf6, 0x39 ; \
	.byte	0x8c, 0xcc, 0x44, 0x00 ; \
	.byte	0x88, 0xcc, 0xc4, 0x20 ; \
	.byte	0xb2, 0xcc, 0x7e, 0x1d ; \
	.byte	0xb6, 0xcc, 0xfe, 0x3d ; \
	.byte	0x80, 0xcd, 0x48, 0x86 ; \
	.byte	0x84, 0xcd, 0xc8, 0xa6 ; \
	.byte	0xba, 0xcd, 0x76, 0x99 ; \
	.byte	0xbe, 0xcd, 0xf6, 0xb9
	!aes_eround01	%f16, %f0, %f2, %f6 ; \
	!aes_eround23	%f18, %f0, %f2, %f4 ; \
	!aes_eround01	%f16, %f60, %f62, %f56 ; \
	!aes_eround23	%f18, %f60, %f62, %f58 ; \
	!aes_eround01	%f20, %f6, %f4, %f0 ; \
	!aes_eround23	%f22, %f6, %f4, %f2 ; \
	!aes_eround01	%f20, %f56, %f58, %f60 ; \
	!aes_eround23	%f22, %f56, %f58, %f62 ; \
	!aes_eround01	%f24, %f0, %f2, %f6 ; \
	!aes_eround23	%f26, %f0, %f2, %f4 ; \
	!aes_eround01	%f24, %f60, %f62, %f56 ; \
	!aes_eround23	%f26, %f60, %f62, %f58 ; \
	!aes_eround01	%f28, %f6, %f4, %f0 ; \
	!aes_eround23	%f30, %f6, %f4, %f2 ; \
	!aes_eround01	%f28, %f56, %f58, %f60 ; \
	!aes_eround23	%f30, %f56, %f58, %f62 ; \
	!aes_eround01	%f32, %f0, %f2, %f6 ; \
	!aes_eround23	%f34, %f0, %f2, %f4 ; \
	!aes_eround01	%f32, %f60, %f62, %f56 ; \
	!aes_eround23	%f34, %f60, %f62, %f58 ; \
	!aes_eround01	%f36, %f6, %f4, %f0 ; \
	!aes_eround23	%f38, %f6, %f4, %f2 ; \
	!aes_eround01	%f36, %f56, %f58, %f60 ; \
	!aes_eround23	%f38, %f56, %f58, %f62 ; \
	!aes_eround01	%f40, %f0, %f2, %f6 ; \
	!aes_eround23	%f42, %f0, %f2, %f4 ; \
	!aes_eround01	%f40, %f60, %f62, %f56 ; \
	!aes_eround23	%f42, %f60, %f62, %f58 ; \
	!aes_eround01	%f44, %f6, %f4, %f0 ; \
	!aes_eround23	%f46, %f6, %f4, %f2 ; \
	!aes_eround01	%f44, %f56, %f58, %f60 ; \
	!aes_eround23	%f46, %f56, %f58, %f62 ; \
	!aes_eround01	%f48, %f0, %f2, %f6 ; \
	!aes_eround23	%f50, %f0, %f2, %f4 ; \
	!aes_eround01	%f48, %f60, %f62, %f56 ; \
	!aes_eround23	%f50, %f60, %f62, %f58 ; \
	!aes_eround01_l	%f52, %f6, %f4, %f0 ; \
	!aes_eround23_l	%f54, %f6, %f4, %f2 ; \
	!aes_eround01_l	%f52, %f56, %f58, %f60 ; \
	!aes_eround23_l	%f54, %f56, %f58, %f62

#define	TWELVE_EROUNDS \
	MID_TWO_EROUNDS	; \
	TEN_EROUNDS

#define	TWELVE_EROUNDS_2 \
	MID_TWO_EROUNDS_2	; \
	TEN_EROUNDS_2

#define	FOURTEEN_EROUNDS \
	FIRST_TWO_EROUNDS ; \
	TWELVE_EROUNDS

#define	FOURTEEN_EROUNDS_2 \
	.byte	0xb0, 0xc8, 0x2c, 0x14 ; \
	.byte	0xac, 0xc8, 0xac, 0x34 ; \
	ldd	[%o0 + 0x60], %f20 ; \
	.byte	0xb2, 0xc8, 0x3e, 0x1d ; \
	.byte	0xb6, 0xc8, 0xbe, 0x3d ; \
	.byte	0x80, 0xc9, 0x2c, 0x18 ; \
	.byte	0x84, 0xc9, 0xac, 0x38 ; \
	ldd	[%o0 + 0x68], %f22 ; \
	.byte	0xba, 0xc9, 0x36, 0x19 ; \
	ldd	[%o0 + 0x70], %f24 ; \
	.byte	0xbe, 0xc9, 0xb6, 0x39 ; \
	.byte	0x8c, 0xca, 0x04, 0x00 ; \
	.byte	0x88, 0xca, 0x84, 0x20 ; \
	.byte	0xb2, 0xca, 0x3e, 0x1d ; \
	.byte	0xb6, 0xca, 0xbe, 0x3d ; \
	.byte	0x80, 0xcb, 0x08, 0x06 ; \
	.byte	0x84, 0xcb, 0x88, 0x26 ; \
	.byte	0xba, 0xcb, 0x36, 0x19 ; \
	.byte	0xbe, 0xcb, 0xb6, 0x39 ; \
	.byte	0x8c, 0xcc, 0x04, 0x00 ; \
	.byte	0x88, 0xcc, 0x84, 0x20 ; \
	.byte	0xb2, 0xcc, 0x3e, 0x1d ; \
	.byte	0xb6, 0xcc, 0xbe, 0x3d ; \
	.byte	0x80, 0xcd, 0x08, 0x06 ; \
	.byte	0x84, 0xcd, 0x88, 0x26 ; \
	.byte	0xba, 0xcd, 0x36, 0x19 ; \
	.byte	0xbe, 0xcd, 0xb6, 0x39 ; \
	.byte	0x8c, 0xce, 0x04, 0x00 ; \
	.byte	0x88, 0xce, 0x84, 0x20 ; \
	.byte	0xb2, 0xce, 0x3e, 0x1d ; \
	.byte	0xb6, 0xce, 0xbe, 0x3d ; \
	.byte	0x80, 0xcf, 0x08, 0x06 ; \
	.byte	0x84, 0xcf, 0x88, 0x26 ; \
	.byte	0xba, 0xcf, 0x36, 0x19 ; \
	.byte	0xbe, 0xcf, 0xb6, 0x39 ; \
	.byte	0x8c, 0xc8, 0x44, 0x00 ; \
	.byte	0x88, 0xc8, 0xc4, 0x20 ; \
	.byte	0xb2, 0xc8, 0x7e, 0x1d ; \
	.byte	0xb6, 0xc8, 0xfe, 0x3d ; \
	.byte	0x80, 0xc9, 0x48, 0x06 ; \
	.byte	0x84, 0xc9, 0xc8, 0x26 ; \
	.byte	0xba, 0xc9, 0x76, 0x19 ; \
	.byte	0xbe, 0xc9, 0xf6, 0x39 ; \
	.byte	0x8c, 0xca, 0x44, 0x00 ; \
	.byte	0x88, 0xca, 0xc4, 0x20 ; \
	.byte	0xb2, 0xca, 0x7e, 0x1d ; \
	.byte	0xb6, 0xca, 0xfe, 0x3d ; \
	.byte	0x80, 0xcb, 0x48, 0x06 ; \
	.byte	0x84, 0xcb, 0xc8, 0x26 ; \
	.byte	0xba, 0xcb, 0x76, 0x19 ; \
	.byte	0xbe, 0xcb, 0xf6, 0x39 ; \
	.byte	0x8c, 0xcc, 0x44, 0x00 ; \
	.byte	0x88, 0xcc, 0xc4, 0x20 ; \
	ldd	[%o0 + 0x10], %f0 ; \
	.byte	0xb2, 0xcc, 0x7e, 0x1d ; \
	ldd	[%o0 + 0x18], %f2 ; \
	.byte	0xb6, 0xcc, 0xfe, 0x3d ; \
	.byte	0xa8, 0xcd, 0x48, 0x86 ; \
	.byte	0xac, 0xcd, 0xc8, 0xa6 ; \
	ldd	[%o0 + 0x20], %f4 ; \
	.byte	0xba, 0xcd, 0x76, 0x99 ; \
	ldd	[%o0 + 0x28], %f6 ; \
	.byte	0xbe, 0xcd, 0xf6, 0xb9
	!aes_eround01	%f0, %f20, %f22, %f24 ; \
	!aes_eround23	%f2, %f20, %f22, %f22 ; \
	!ldd	[%o0 + 0x60], %f20 ; \
	!aes_eround01	%f0, %f60, %f62, %f56 ; \
	!aes_eround23	%f2, %f60, %f62, %f58 ; \
	!aes_eround01	%f4, %f24, %f22, %f0 ; \
	!aes_eround23	%f6, %f24, %f22, %f2 ; \
	!ldd	[%o0 + 0x68], %f22 ; \
	!aes_eround01	%f4, %f56, %f58, %f60 ; \
	!ldd	[%o0 + 0x70], %f24 ; \
	!aes_eround23	%f6, %f56, %f58, %f62 ; \
	!aes_eround01	%f8, %f0, %f2, %f6 ; \
	!aes_eround23	%f10, %f0, %f2, %f4 ; \
	!aes_eround01	%f8, %f60, %f62, %f56 ; \
	!aes_eround23	%f10, %f60, %f62, %f58 ; \
	!aes_eround01	%f12, %f6, %f4, %f0 ; \
	!aes_eround23	%f14, %f6, %f4, %f2 ; \
	!aes_eround01	%f12, %f56, %f58, %f60 ; \
	!aes_eround23	%f14, %f56, %f58, %f62 ; \
	!aes_eround01	%f16, %f0, %f2, %f6 ; \
	!aes_eround23	%f18, %f0, %f2, %f4 ; \
	!aes_eround01	%f16, %f60, %f62, %f56 ; \
	!aes_eround23	%f18, %f60, %f62, %f58 ; \
	!aes_eround01	%f20, %f6, %f4, %f0 ; \
	!aes_eround23	%f22, %f6, %f4, %f2 ; \
	!aes_eround01	%f20, %f56, %f58, %f60 ; \
	!aes_eround23	%f22, %f56, %f58, %f62 ; \
	!aes_eround01	%f24, %f0, %f2, %f6 ; \
	!aes_eround23	%f26, %f0, %f2, %f4 ; \
	!aes_eround01	%f24, %f60, %f62, %f56 ; \
	!aes_eround23	%f26, %f60, %f62, %f58 ; \
	!aes_eround01	%f28, %f6, %f4, %f0 ; \
	!aes_eround23	%f30, %f6, %f4, %f2 ; \
	!aes_eround01	%f28, %f56, %f58, %f60 ; \
	!aes_eround23	%f30, %f56, %f58, %f62 ; \
	!aes_eround01	%f32, %f0, %f2, %f6 ; \
	!aes_eround23	%f34, %f0, %f2, %f4 ; \
	!aes_eround01	%f32, %f60, %f62, %f56 ; \
	!aes_eround23	%f34, %f60, %f62, %f58 ; \
	!aes_eround01	%f36, %f6, %f4, %f0 ; \
	!aes_eround23	%f38, %f6, %f4, %f2 ; \
	!aes_eround01	%f36, %f56, %f58, %f60 ; \
	!aes_eround23	%f38, %f56, %f58, %f62 ; \
	!aes_eround01	%f40, %f0, %f2, %f6 ; \
	!aes_eround23	%f42, %f0, %f2, %f4 ; \
	!aes_eround01	%f40, %f60, %f62, %f56 ; \
	!aes_eround23	%f42, %f60, %f62, %f58 ; \
	!aes_eround01	%f44, %f6, %f4, %f0 ; \
	!aes_eround23	%f46, %f6, %f4, %f2 ; \
	!aes_eround01	%f44, %f56, %f58, %f60 ; \
	!aes_eround23	%f46, %f56, %f58, %f62 ; \
	!aes_eround01	%f48, %f0, %f2, %f6 ; \
	!aes_eround23	%f50, %f0, %f2, %f4 ; \
	!ldd	[%o0 + 0x10], %f0 ; \
	!aes_eround01	%f48, %f60, %f62, %f56 ; \
	!ldd	[%o0 + 0x18], %f2 ; \
	!aes_eround23	%f50, %f60, %f62, %f58 ; \
	!aes_eround01_l	%f52, %f6, %f4, %f20 ; \
	!aes_eround23_l	%f54, %f6, %f4, %f22 ; \
	!ldd	[%o0 + 0x20], %f4 ; \
	!aes_eround01_l	%f52, %f56, %f58, %f60 ; \
	!ldd	[%o0 + 0x28], %f6 ; \
	!aes_eround23_l	%f54, %f56, %f58, %f62

#define	FIRST_TWO_DROUNDS \
	.byte	0xb2, 0xc8, 0x3e, 0x5d ; \
	.byte	0xb6, 0xc8, 0xbe, 0x7d ; \
	.byte	0xba, 0xc9, 0x36, 0x59 ; \
	.byte	0xbe, 0xc9, 0xb6, 0x79
	!aes_dround01	%f0, %f60, %f62, %f56 ; \
	!aes_dround23	%f2, %f60, %f62, %f58 ; \
	!aes_dround01	%f4, %f56, %f58, %f60 ; \
	!aes_dround23	%f6, %f56, %f58, %f62

#define	MID_TWO_DROUNDS \
	.byte	0xb2, 0xca, 0x3e, 0x5d ; \
	.byte	0xb6, 0xca, 0xbe, 0x7d ; \
	.byte	0xba, 0xcb, 0x36, 0x59 ; \
	.byte	0xbe, 0xcb, 0xb6, 0x79
	!aes_dround01	%f8, %f60, %f62, %f56 ; \
	!aes_dround23	%f10, %f60, %f62, %f58 ; \
	!aes_dround01	%f12, %f56, %f58, %f60 ; \
	!aes_dround23	%f14, %f56, %f58, %f62

#define	MID_TWO_DROUNDS_2 \
	.byte	0x8c, 0xca, 0x04, 0x40 ; \
	.byte	0x88, 0xca, 0x84, 0x60 ; \
	.byte	0xb2, 0xca, 0x3e, 0x5d ; \
	.byte	0xb6, 0xca, 0xbe, 0x7d ; \
	.byte	0x80, 0xcb, 0x08, 0x46 ; \
	.byte	0x84, 0xcb, 0x88, 0x66 ; \
	.byte	0xba, 0xcb, 0x36, 0x59 ; \
	.byte	0xbe, 0xcb, 0xb6, 0x79
	!aes_dround01	%f8, %f0, %f2, %f6 ; \
	!aes_dround23	%f10, %f0, %f2, %f4 ; \
	!aes_dround01	%f8, %f60, %f62, %f56 ; \
	!aes_dround23	%f10, %f60, %f62, %f58 ; \
	!aes_dround01	%f12, %f6, %f4, %f0 ; \
	!aes_dround23	%f14, %f6, %f4, %f2 ; \
	!aes_dround01	%f12, %f56, %f58, %f60 ; \
	!aes_dround23	%f14, %f56, %f58, %f62

#define	TEN_DROUNDS \
	.byte	0xb2, 0xcc, 0x3e, 0x5d ; \
	.byte	0xb6, 0xcc, 0xbe, 0x7d ; \
	.byte	0xba, 0xcd, 0x36, 0x59 ; \
	.byte	0xbe, 0xcd, 0xb6, 0x79 ; \
	.byte	0xb2, 0xce, 0x3e, 0x5d ; \
	.byte	0xb6, 0xce, 0xbe, 0x7d ; \
	.byte	0xba, 0xcf, 0x36, 0x59 ; \
	.byte	0xbe, 0xcf, 0xb6, 0x79 ; \
	.byte	0xb2, 0xc8, 0x7e, 0x5d ; \
	.byte	0xb6, 0xc8, 0xfe, 0x7d ; \
	.byte	0xba, 0xc9, 0x76, 0x59 ; \
	.byte	0xbe, 0xc9, 0xf6, 0x79 ; \
	.byte	0xb2, 0xca, 0x7e, 0x5d ; \
	.byte	0xb6, 0xca, 0xfe, 0x7d ; \
	.byte	0xba, 0xcb, 0x76, 0x59 ; \
	.byte	0xbe, 0xcb, 0xf6, 0x79 ; \
	.byte	0xb2, 0xcc, 0x7e, 0x5d ; \
	.byte	0xb6, 0xcc, 0xfe, 0x7d ; \
	.byte	0xba, 0xcd, 0x76, 0xd9 ; \
	.byte	0xbe, 0xcd, 0xf6, 0xf9
	!aes_dround01	%f16, %f60, %f62, %f56 ; \
	!aes_dround23	%f18, %f60, %f62, %f58 ; \
	!aes_dround01	%f20, %f56, %f58, %f60 ; \
	!aes_dround23	%f22, %f56, %f58, %f62 ; \
	!aes_dround01	%f24, %f60, %f62, %f56 ; \
	!aes_dround23	%f26, %f60, %f62, %f58 ; \
	!aes_dround01	%f28, %f56, %f58, %f60 ; \
	!aes_dround23	%f30, %f56, %f58, %f62 ; \
	!aes_dround01	%f32, %f60, %f62, %f56 ; \
	!aes_dround23	%f34, %f60, %f62, %f58 ; \
	!aes_dround01	%f36, %f56, %f58, %f60 ; \
	!aes_dround23	%f38, %f56, %f58, %f62 ; \
	!aes_dround01	%f40, %f60, %f62, %f56 ; \
	!aes_dround23	%f42, %f60, %f62, %f58 ; \
	!aes_dround01	%f44, %f56, %f58, %f60 ; \
	!aes_dround23	%f46, %f56, %f58, %f62 ; \
	!aes_dround01	%f48, %f60, %f62, %f56 ; \
	!aes_dround23	%f50, %f60, %f62, %f58 ; \
	!aes_dround01_l	%f52, %f56, %f58, %f60 ; \
	!aes_dround23_l	%f54, %f56, %f58, %f62

#define	TEN_DROUNDS_2 \
	.byte	0x8c, 0xcc, 0x04, 0x40 ; \
	.byte	0x88, 0xcc, 0x84, 0x60 ; \
	.byte	0xb2, 0xcc, 0x3e, 0x5d ; \
	.byte	0xb6, 0xcc, 0xbe, 0x7d ; \
	.byte	0x80, 0xcd, 0x08, 0x46 ; \
	.byte	0x84, 0xcd, 0x88, 0x66 ; \
	.byte	0xba, 0xcd, 0x36, 0x59 ; \
	.byte	0xbe, 0xcd, 0xb6, 0x79 ; \
	.byte	0x8c, 0xce, 0x04, 0x40 ; \
	.byte	0x88, 0xce, 0x84, 0x60 ; \
	.byte	0xb2, 0xce, 0x3e, 0x5d ; \
	.byte	0xb6, 0xce, 0xbe, 0x7d ; \
	.byte	0x80, 0xcf, 0x08, 0x46 ; \
	.byte	0x84, 0xcf, 0x88, 0x66 ; \
	.byte	0xba, 0xcf, 0x36, 0x59 ; \
	.byte	0xbe, 0xcf, 0xb6, 0x79 ; \
	.byte	0x8c, 0xc8, 0x44, 0x40 ; \
	.byte	0x88, 0xc8, 0xc4, 0x60 ; \
	.byte	0xb2, 0xc8, 0x7e, 0x5d ; \
	.byte	0xb6, 0xc8, 0xfe, 0x7d ; \
	.byte	0x80, 0xc9, 0x48, 0x46 ; \
	.byte	0x84, 0xc9, 0xc8, 0x66 ; \
	.byte	0xba, 0xc9, 0x76, 0x59 ; \
	.byte	0xbe, 0xc9, 0xf6, 0x79 ; \
	.byte	0x8c, 0xca, 0x44, 0x40 ; \
	.byte	0x88, 0xca, 0xc4, 0x60 ; \
	.byte	0xb2, 0xca, 0x7e, 0x5d ; \
	.byte	0xb6, 0xca, 0xfe, 0x7d ; \
	.byte	0x80, 0xcb, 0x48, 0x46 ; \
	.byte	0x84, 0xcb, 0xc8, 0x66 ; \
	.byte	0xba, 0xcb, 0x76, 0x59 ; \
	.byte	0xbe, 0xcb, 0xf6, 0x79 ; \
	.byte	0x8c, 0xcc, 0x44, 0x40 ; \
	.byte	0x88, 0xcc, 0xc4, 0x60 ; \
	.byte	0xb2, 0xcc, 0x7e, 0x5d ; \
	.byte	0xb6, 0xcc, 0xfe, 0x7d ; \
	.byte	0x80, 0xcd, 0x48, 0xc6 ; \
	.byte	0x84, 0xcd, 0xc8, 0xe6 ; \
	.byte	0xba, 0xcd, 0x76, 0xd9 ; \
	.byte	0xbe, 0xcd, 0xf6, 0xf9
	!aes_dround01	%f16, %f0, %f2, %f6 ; \
	!aes_dround23	%f18, %f0, %f2, %f4 ; \
	!aes_dround01	%f16, %f60, %f62, %f56 ; \
	!aes_dround23	%f18, %f60, %f62, %f58 ; \
	!aes_dround01	%f20, %f6, %f4, %f0 ; \
	!aes_dround23	%f22, %f6, %f4, %f2 ; \
	!aes_dround01	%f20, %f56, %f58, %f60 ; \
	!aes_dround23	%f22, %f56, %f58, %f62 ; \
	!aes_dround01	%f24, %f0, %f2, %f6 ; \
	!aes_dround23	%f26, %f0, %f2, %f4 ; \
	!aes_dround01	%f24, %f60, %f62, %f56 ; \
	!aes_dround23	%f26, %f60, %f62, %f58 ; \
	!aes_dround01	%f28, %f6, %f4, %f0 ; \
	!aes_dround23	%f30, %f6, %f4, %f2 ; \
	!aes_dround01	%f28, %f56, %f58, %f60 ; \
	!aes_dround23	%f30, %f56, %f58, %f62 ; \
	!aes_dround01	%f32, %f0, %f2, %f6 ; \
	!aes_dround23	%f34, %f0, %f2, %f4 ; \
	!aes_dround01	%f32, %f60, %f62, %f56 ; \
	!aes_dround23	%f34, %f60, %f62, %f58 ; \
	!aes_dround01	%f36, %f6, %f4, %f0 ; \
	!aes_dround23	%f38, %f6, %f4, %f2 ; \
	!aes_dround01	%f36, %f56, %f58, %f60 ; \
	!aes_dround23	%f38, %f56, %f58, %f62 ; \
	!aes_dround01	%f40, %f0, %f2, %f6 ; \
	!aes_dround23	%f42, %f0, %f2, %f4 ; \
	!aes_dround01	%f40, %f60, %f62, %f56 ; \
	!aes_dround23	%f42, %f60, %f62, %f58 ; \
	!aes_dround01	%f44, %f6, %f4, %f0 ; \
	!aes_dround23	%f46, %f6, %f4, %f2 ; \
	!aes_dround01	%f44, %f56, %f58, %f60 ; \
	!aes_dround23	%f46, %f56, %f58, %f62 ; \
	!aes_dround01	%f48, %f0, %f2, %f6 ; \
	!aes_dround23	%f50, %f0, %f2, %f4 ; \
	!aes_dround01	%f48, %f60, %f62, %f56 ; \
	!aes_dround23	%f50, %f60, %f62, %f58 ; \
	!aes_dround01_l	%f52, %f6, %f4, %f0 ; \
	!aes_dround23_l	%f54, %f6, %f4, %f2 ; \
	!aes_dround01_l	%f52, %f56, %f58, %f60 ; \
	!aes_dround23_l	%f54, %f56, %f58, %f62

#define	TWELVE_DROUNDS \
	MID_TWO_DROUNDS	; \
	TEN_DROUNDS

#define	TWELVE_DROUNDS_2 \
	MID_TWO_DROUNDS_2	; \
	TEN_DROUNDS_2

#define	FOURTEEN_DROUNDS \
	FIRST_TWO_DROUNDS ; \
	TWELVE_DROUNDS

#define	FOURTEEN_DROUNDS_2 \
	.byte	0xb0, 0xc8, 0x2c, 0x54 ; \
	.byte	0xac, 0xc8, 0xac, 0x74 ; \
	ldd	[%o0 + 0x80], %f20 ; \
	.byte	0xb2, 0xc8, 0x3e, 0x5d ; \
	.byte	0xb6, 0xc8, 0xbe, 0x7d ; \
	.byte	0x80, 0xc9, 0x2c, 0x58 ; \
	.byte	0x84, 0xc9, 0xac, 0x78 ; \
	ldd	[%o0 + 0x88], %f22 ; \
	.byte	0xba, 0xc9, 0x36, 0x59 ; \
	ldd	[%o0 + 0x70], %f24 ; \
	.byte	0xbe, 0xc9, 0xb6, 0x79 ; \
	.byte	0x8c, 0xca, 0x04, 0x40 ; \
	.byte	0x88, 0xca, 0x84, 0x60 ; \
	.byte	0xb2, 0xca, 0x3e, 0x5d ; \
	.byte	0xb6, 0xca, 0xbe, 0x7d ; \
	.byte	0x80, 0xcb, 0x08, 0x46 ; \
	.byte	0x84, 0xcb, 0x88, 0x66 ; \
	.byte	0xba, 0xcb, 0x36, 0x59 ; \
	.byte	0xbe, 0xcb, 0xb6, 0x79 ; \
	.byte	0x8c, 0xcc, 0x04, 0x40 ; \
	.byte	0x88, 0xcc, 0x84, 0x60 ; \
	.byte	0xb2, 0xcc, 0x3e, 0x5d ; \
	.byte	0xb6, 0xcc, 0xbe, 0x7d ; \
	.byte	0x80, 0xcd, 0x08, 0x46 ; \
	.byte	0x84, 0xcd, 0x88, 0x66 ; \
	.byte	0xba, 0xcd, 0x36, 0x59 ; \
	.byte	0xbe, 0xcd, 0xb6, 0x79 ; \
	.byte	0x8c, 0xce, 0x04, 0x40 ; \
	.byte	0x88, 0xce, 0x84, 0x60 ; \
	.byte	0xb2, 0xce, 0x3e, 0x5d ; \
	.byte	0xb6, 0xce, 0xbe, 0x7d ; \
	.byte	0x80, 0xcf, 0x08, 0x46 ; \
	.byte	0x84, 0xcf, 0x88, 0x66 ; \
	.byte	0xba, 0xcf, 0x36, 0x59 ; \
	.byte	0xbe, 0xcf, 0xb6, 0x79 ; \
	.byte	0x8c, 0xc8, 0x44, 0x40 ; \
	.byte	0x88, 0xc8, 0xc4, 0x60 ; \
	.byte	0xb2, 0xc8, 0x7e, 0x5d ; \
	.byte	0xb6, 0xc8, 0xfe, 0x7d ; \
	.byte	0x80, 0xc9, 0x48, 0x46 ; \
	.byte	0x84, 0xc9, 0xc8, 0x66 ; \
	.byte	0xba, 0xc9, 0x76, 0x59 ; \
	.byte	0xbe, 0xc9, 0xf6, 0x79 ; \
	.byte	0x8c, 0xca, 0x44, 0x40 ; \
	.byte	0x88, 0xca, 0xc4, 0x60 ; \
	.byte	0xb2, 0xca, 0x7e, 0x5d ; \
	.byte	0xb6, 0xca, 0xfe, 0x7d ; \
	.byte	0x80, 0xcb, 0x48, 0x46 ; \
	.byte	0x84, 0xcb, 0xc8, 0x66 ; \
	.byte	0xba, 0xcb, 0x76, 0x59 ; \
	.byte	0xbe, 0xcb, 0xf6, 0x79 ; \
	.byte	0x8c, 0xcc, 0x44, 0x40 ; \
	.byte	0x88, 0xcc, 0xc4, 0x60 ; \
	ldd	[%o0 + 0xd0], %f0 ; \
	.byte	0xb2, 0xcc, 0x7e, 0x5d ; \
	ldd	[%o0 + 0xd8], %f2 ; \
	.byte	0xb6, 0xcc, 0xfe, 0x7d ; \
	.byte	0xa8, 0xcd, 0x48, 0xc6 ; \
	.byte	0xac, 0xcd, 0xc8, 0xe6 ; \
	ldd	[%o0 + 0xc0], %f4 ; \
	.byte	0xba, 0xcd, 0x76, 0xd9 ; \
	ldd	[%o0 + 0xc8], %f6 ; \
	.byte	0xbe, 0xcd, 0xf6, 0xf9
	!aes_dround01	%f0, %f20, %f22, %f24 ; \
	!aes_dround23	%f2, %f20, %f22, %f22 ; \
	!ldd	[%o0 + 0x80], %f20 ; \
	!aes_dround01	%f0, %f60, %f62, %f56 ; \
	!aes_dround23	%f2, %f60, %f62, %f58 ; \
	!aes_dround01	%f4, %f24, %f22, %f0 ; \
	!aes_dround23	%f6, %f24, %f22, %f2 ; \
	!ldd	[%o0 + 0x88], %f22 ; \
	!aes_dround01	%f4, %f56, %f58, %f60 ; \
	!ldd	[%o0 + 0x70], %f24 ; \
	!aes_dround23	%f6, %f56, %f58, %f62 ; \
	!aes_dround01	%f8, %f0, %f2, %f6 ; \
	!aes_dround23	%f10, %f0, %f2, %f4 ; \
	!aes_dround01	%f8, %f60, %f62, %f56 ; \
	!aes_dround23	%f10, %f60, %f62, %f58 ; \
	!aes_dround01	%f12, %f6, %f4, %f0 ; \
	!aes_dround23	%f14, %f6, %f4, %f2 ; \
	!aes_dround01	%f12, %f56, %f58, %f60 ; \
	!aes_dround23	%f14, %f56, %f58, %f62 ; \
	!aes_dround01	%f16, %f0, %f2, %f6 ; \
	!aes_dround23	%f18, %f0, %f2, %f4 ; \
	!aes_dround01	%f16, %f60, %f62, %f56 ; \
	!aes_dround23	%f18, %f60, %f62, %f58 ; \
	!aes_dround01	%f20, %f6, %f4, %f0 ; \
	!aes_dround23	%f22, %f6, %f4, %f2 ; \
	!aes_dround01	%f20, %f56, %f58, %f60 ; \
	!aes_dround23	%f22, %f56, %f58, %f62 ; \
	!aes_dround01	%f24, %f0, %f2, %f6 ; \
	!aes_dround23	%f26, %f0, %f2, %f4 ; \
	!aes_dround01	%f24, %f60, %f62, %f56 ; \
	!aes_dround23	%f26, %f60, %f62, %f58 ; \
	!aes_dround01	%f28, %f6, %f4, %f0 ; \
	!aes_dround23	%f30, %f6, %f4, %f2 ; \
	!aes_dround01	%f28, %f56, %f58, %f60 ; \
	!aes_dround23	%f30, %f56, %f58, %f62 ; \
	!aes_dround01	%f32, %f0, %f2, %f6 ; \
	!aes_dround23	%f34, %f0, %f2, %f4 ; \
	!aes_dround01	%f32, %f60, %f62, %f56 ; \
	!aes_dround23	%f34, %f60, %f62, %f58 ; \
	!aes_dround01	%f36, %f6, %f4, %f0 ; \
	!aes_dround23	%f38, %f6, %f4, %f2 ; \
	!aes_dround01	%f36, %f56, %f58, %f60 ; \
	!aes_dround23	%f38, %f56, %f58, %f62 ; \
	!aes_dround01	%f40, %f0, %f2, %f6 ; \
	!aes_dround23	%f42, %f0, %f2, %f4 ; \
	!aes_dround01	%f40, %f60, %f62, %f56 ; \
	!aes_dround23	%f42, %f60, %f62, %f58 ; \
	!aes_dround01	%f44, %f6, %f4, %f0 ; \
	!aes_dround23	%f46, %f6, %f4, %f2 ; \
	!aes_dround01	%f44, %f56, %f58, %f60 ; \
	!aes_dround23	%f46, %f56, %f58, %f62 ; \
	!aes_dround01	%f48, %f0, %f2, %f6 ; \
	!aes_dround23	%f50, %f0, %f2, %f4 ; \
	!ldd	[%o0 + 0xd0], %f0 ; \
	!aes_dround01	%f48, %f60, %f62, %f56 ; \
	!ldd	[%o0 + 0xd8], %f2 ; \
	!aes_dround23	%f50, %f60, %f62, %f58 ; \
	!aes_dround01_l	%f52, %f6, %f4, %f20 ; \
	!aes_dround23_l	%f54, %f6, %f4, %f22 ; \
	!ldd	[%o0 + 0xc0], %f4 ; \
	!aes_dround01_l	%f52, %f56, %f58, %f60 ; \
	!ldd	[%o0 + 0xc8], %f6 ; \
	!aes_dround23_l	%f54, %f56, %f58, %f62
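
! Key-loading convention: the *_load_keys_for_encrypt routines pin
! every round key except the first in FP registers (%f16-%f54 for
! AES-128; the longer schedules start at lower registers), while the
! first round key (ks[0]/ks[1]) is applied with integer xors inside
! the crypt routines.  The FP key registers must therefore stay live
! between the load call and the subsequent crypt call.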


	ENTRY(t4_aes128_load_keys_for_encrypt)

	ldd	[%o0 + 0x10], %f16
	ldd	[%o0 + 0x18], %f18
	ldd	[%o0 + 0x20], %f20
	ldd	[%o0 + 0x28], %f22
	ldd	[%o0 + 0x30], %f24
	ldd	[%o0 + 0x38], %f26
	ldd	[%o0 + 0x40], %f28
	ldd	[%o0 + 0x48], %f30
	ldd	[%o0 + 0x50], %f32
	ldd	[%o0 + 0x58], %f34
	ldd	[%o0 + 0x60], %f36
	ldd	[%o0 + 0x68], %f38
	ldd	[%o0 + 0x70], %f40
	ldd	[%o0 + 0x78], %f42
	ldd	[%o0 + 0x80], %f44
	ldd	[%o0 + 0x88], %f46
	ldd	[%o0 + 0x90], %f48
	ldd	[%o0 + 0x98], %f50
	ldd	[%o0 + 0xa0], %f52
	retl
	ldd	[%o0 + 0xa8], %f54

	SET_SIZE(t4_aes128_load_keys_for_encrypt)


	ENTRY(t4_aes192_load_keys_for_encrypt)

	ldd	[%o0 + 0x10], %f8
	ldd	[%o0 + 0x18], %f10
	ldd	[%o0 + 0x20], %f12
	ldd	[%o0 + 0x28], %f14
	ldd	[%o0 + 0x30], %f16
	ldd	[%o0 + 0x38], %f18
	ldd	[%o0 + 0x40], %f20
	ldd	[%o0 + 0x48], %f22
	ldd	[%o0 + 0x50], %f24
	ldd	[%o0 + 0x58], %f26
	ldd	[%o0 + 0x60], %f28
	ldd	[%o0 + 0x68], %f30
	ldd	[%o0 + 0x70], %f32
	ldd	[%o0 + 0x78], %f34
	ldd	[%o0 + 0x80], %f36
	ldd	[%o0 + 0x88], %f38
	ldd	[%o0 + 0x90], %f40
	ldd	[%o0 + 0x98], %f42
	ldd	[%o0 + 0xa0], %f44
	ldd	[%o0 + 0xa8], %f46
	ldd	[%o0 + 0xb0], %f48
	ldd	[%o0 + 0xb8], %f50
	ldd	[%o0 + 0xc0], %f52
	retl
	ldd	[%o0 + 0xc8], %f54

	SET_SIZE(t4_aes192_load_keys_for_encrypt)


	ENTRY(t4_aes256_load_keys_for_encrypt)

	ldd	[%o0 + 0x10], %f0
	ldd	[%o0 + 0x18], %f2
	ldd	[%o0 + 0x20], %f4
	ldd	[%o0 + 0x28], %f6
	ldd	[%o0 + 0x30], %f8
	ldd	[%o0 + 0x38], %f10
	ldd	[%o0 + 0x40], %f12
	ldd	[%o0 + 0x48], %f14
	ldd	[%o0 + 0x50], %f16
	ldd	[%o0 + 0x58], %f18
	ldd	[%o0 + 0x60], %f20
	ldd	[%o0 + 0x68], %f22
	ldd	[%o0 + 0x70], %f24
	ldd	[%o0 + 0x78], %f26
	ldd	[%o0 + 0x80], %f28
	ldd	[%o0 + 0x88], %f30
	ldd	[%o0 + 0x90], %f32
	ldd	[%o0 + 0x98], %f34
	ldd	[%o0 + 0xa0], %f36
	ldd	[%o0 + 0xa8], %f38
	ldd	[%o0 + 0xb0], %f40
	ldd	[%o0 + 0xb8], %f42
	ldd	[%o0 + 0xc0], %f44
	ldd	[%o0 + 0xc8], %f46
	ldd	[%o0 + 0xd0], %f48
	ldd	[%o0 + 0xd8], %f50
	ldd	[%o0 + 0xe0], %f52
	retl
	ldd	[%o0 + 0xe8], %f54

	SET_SIZE(t4_aes256_load_keys_for_encrypt)


#define	TEST_PARALLEL_ECB_ENCRYPT
#ifdef	TEST_PARALLEL_ECB_ENCRYPT
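! Parallel ECB: when the byte count has a 16-byte remainder
! (amount & 16), one block is handled up front; the main loop then
! consumes 32 bytes per iteration via the two-block *_EROUNDS_2
! macros.  Roughly, in C terms (illustrative only):
!
!	if (len & 16) { one_block(); len -= 16; }
!	while (len != 0) { two_blocks(); len -= 32; }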
	ENTRY(t4_aes128_ecb_encrypt)

	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]
	and	%o3, 16, %o4
	brz	%o4, ecbenc128_loop
	nop

	ldx	[%o1], %g3	!input
	ldx	[%o1 + 8], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f60
	movxtod	%g4, %f62

	TEN_EROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	be	ecbenc128_loop_end
	add	%o2, 16, %o2

ecbenc128_loop:
	ldx	[%o1], %g3	!input
	ldx	[%o1 + 8], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f0
	movxtod	%g4, %f2
	ldx	[%o1 + 16], %g3	!input
	ldx	[%o1 + 24], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f60
	movxtod	%g4, %f62

	TEN_EROUNDS_2

	std	%f0, [%o2]
	std	%f2, [%o2 + 8]

	std	%f60, [%o2 + 16]
	std	%f62, [%o2 + 24]

	add	%o1, 32, %o1
	subcc	%o3, 32, %o3
	bne	ecbenc128_loop
	add	%o2, 32, %o2
ecbenc128_loop_end:
	retl
	nop

	SET_SIZE(t4_aes128_ecb_encrypt)


	ENTRY(t4_aes192_ecb_encrypt)

	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]
	and	%o3, 16, %o4
	brz	%o4, ecbenc192_loop
	nop

	ldx	[%o1], %g3	!input
	ldx	[%o1 + 8], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f60
	movxtod	%g4, %f62

	TWELVE_EROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	be	ecbenc192_loop_end
	add	%o2, 16, %o2

ecbenc192_loop:
	ldx	[%o1], %g3	!input
	ldx	[%o1 + 8], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f0
	movxtod	%g4, %f2
	ldx	[%o1 + 16], %g3	!input
	ldx	[%o1 + 24], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f60
	movxtod	%g4, %f62

	TWELVE_EROUNDS_2

	std	%f0, [%o2]
	std	%f2, [%o2 + 8]

	std	%f60, [%o2 + 16]
	std	%f62, [%o2 + 24]

	add	%o1, 32, %o1
	subcc	%o3, 32, %o3
	bne	ecbenc192_loop
	add	%o2, 32, %o2
ecbenc192_loop_end:
	retl
	nop

	SET_SIZE(t4_aes192_ecb_encrypt)


	ENTRY(t4_aes256_ecb_encrypt)

	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]
	and	%o3, 16, %o4
	brz	%o4, ecbenc256_loop
	nop

	ldx	[%o1], %g3	!input
	ldx	[%o1 + 8], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f60
	movxtod	%g4, %f62

	FOURTEEN_EROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	be	ecbenc256_loop_end
	add	%o2, 16, %o2

ecbenc256_loop:
	ldx	[%o1], %g3	!input
	ldx	[%o1 + 8], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f20
	movxtod	%g4, %f22
	ldx	[%o1 + 16], %g3	!input
	ldx	[%o1 + 24], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f60
	movxtod	%g4, %f62

	FOURTEEN_EROUNDS_2

	std	%f20, [%o2]
	std	%f22, [%o2 + 8]

	std	%f60, [%o2 + 16]
	std	%f62, [%o2 + 24]

	add	%o1, 32, %o1
	subcc	%o3, 32, %o3
	bne	ecbenc256_loop
	add	%o2, 32, %o2

	ldd	[%o0 + 0x60], %f20
	ldd	[%o0 + 0x68], %f22

ecbenc256_loop_end:
	retl
	nop

	SET_SIZE(t4_aes256_ecb_encrypt)

#else

	ENTRY(t4_aes128_ecb_encrypt)

	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

ecbenc128_loop:
	ldx	[%o1], %g3	!input
	ldx	[%o1 + 8], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f60
	movxtod	%g4, %f62

	TEN_EROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	ecbenc128_loop
	add	%o2, 16, %o2

	retl
	nop

	SET_SIZE(t4_aes128_ecb_encrypt)


	ENTRY(t4_aes192_ecb_encrypt)

	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

ecbenc192_loop:
	ldx	[%o1], %g3	!input
	ldx	[%o1 + 8], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f60
	movxtod	%g4, %f62

	TWELVE_EROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	ecbenc192_loop
	add	%o2, 16, %o2

	retl
	nop

	SET_SIZE(t4_aes192_ecb_encrypt)


	ENTRY(t4_aes256_ecb_encrypt)

	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

ecbenc256_loop:
	ldx	[%o1], %g3	!input
	ldx	[%o1 + 8], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f60
	movxtod	%g4, %f62

	FOURTEEN_EROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	ecbenc256_loop
	add	%o2, 16, %o2

	retl
	nop

	SET_SIZE(t4_aes256_ecb_encrypt)
#endif
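
! CBC encryption chains each ciphertext block into the next input
! (C[i] = E(P[i] ^ C[i-1])), so there is no two-block parallel
! variant; %f60/%f62 carry the running IV across iterations and are
! stored back to the iv buffer on exit.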


	ENTRY(t4_aes128_cbc_encrypt)

	ldd	[%o4], %f60	! IV
	ldd	[%o4 + 8], %f62	! IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

cbcenc128_loop:
	ldx	[%o1], %g3	!input
	ldx	[%o1 + 8], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f56
	movxtod	%g4, %f58
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	TEN_EROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	cbcenc128_loop
	add	%o2, 16, %o2

	std	%f60, [%o4]
	retl
	std	%f62, [%o4 + 8]

	SET_SIZE(t4_aes128_cbc_encrypt)


	ENTRY(t4_aes192_cbc_encrypt)

	ldd	[%o4], %f60	! IV
	ldd	[%o4 + 8], %f62	! IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

cbcenc192_loop:
	ldx	[%o1], %g3	!input
	ldx	[%o1 + 8], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f56
	movxtod	%g4, %f58
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	TWELVE_EROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	cbcenc192_loop
	add	%o2, 16, %o2

	std	%f60, [%o4]
	retl
	std	%f62, [%o4 + 8]

	SET_SIZE(t4_aes192_cbc_encrypt)


	ENTRY(t4_aes256_cbc_encrypt)

	ldd	[%o4], %f60	! IV
	ldd	[%o4 + 8], %f62	! IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

cbcenc256_loop:
	ldx	[%o1], %g3	!input
	ldx	[%o1 + 8], %g4	!input
	xor	%g1, %g3, %g3	!input ^ ks[0-1]
	xor	%g2, %g4, %g4	!input ^ ks[0-1]
	movxtod	%g3, %f56
	movxtod	%g4, %f58
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	FOURTEEN_EROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	cbcenc256_loop
	add	%o2, 16, %o2

	std	%f60, [%o4]
	retl
	std	%f62, [%o4 + 8]

	SET_SIZE(t4_aes256_cbc_encrypt)


#define	TEST_PARALLEL_CTR_CRYPT
#ifdef	TEST_PARALLEL_CTR_CRYPT
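! CTR mode: the 128-bit counter lives in %g3 (high half) and %g4
! (low half).  Only %g4 is incremented per block, so a carry out of
! the low 64 bits does not propagate into %g3.  Each iteration xors
! the encrypted counter with the input to produce the output.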
	ENTRY(t4_aes128_ctr_crypt)

	ldx	[%o4], %g3	! IV
	ldx	[%o4 + 8], %g4	! IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]
	and	%o3, 16, %g5
	brz	%g5, ctr128_loop
	nop

	xor	%g1, %g3, %g5
	movxtod	%g5, %f60
	xor	%g2, %g4, %g5
	movxtod	%g5, %f62
	inc	%g4

	TEN_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62
	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	be	ctr128_loop_end
	add	%o2, 16, %o2

ctr128_loop:
	xor	%g1, %g3, %g5
	movxtod	%g5, %f0
	xor	%g2, %g4, %g5
	movxtod	%g5, %f2
	inc	%g4

	xor	%g1, %g3, %g5
	movxtod	%g5, %f60
	xor	%g2, %g4, %g5
	movxtod	%g5, %f62
	inc	%g4

	TEN_EROUNDS_2

	ldd	[%o1], %f6		!input
	ldd	[%o1 + 8], %f4		!input
	ldd	[%o1 + 16], %f56	!input
	ldd	[%o1 + 24], %f58	!input
	fxor	%f0, %f6, %f0
	fxor	%f2, %f4, %f2
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62
	std	%f0, [%o2]
	std	%f2, [%o2 + 8]
	std	%f60, [%o2 + 16]
	std	%f62, [%o2 + 24]

	add	%o1, 32, %o1
	subcc	%o3, 32, %o3
	bne	ctr128_loop
	add	%o2, 32, %o2

ctr128_loop_end:
	stx	%g3, [%o4]
	retl
	stx	%g4, [%o4 + 8]

	SET_SIZE(t4_aes128_ctr_crypt)


	ENTRY(t4_aes192_ctr_crypt)

	ldx	[%o4], %g3	! IV
	ldx	[%o4 + 8], %g4	! IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]
	and	%o3, 16, %g5
	brz	%g5, ctr192_loop
	nop

	xor	%g1, %g3, %g5
	movxtod	%g5, %f60
	xor	%g2, %g4, %g5
	movxtod	%g5, %f62
	inc	%g4

	TWELVE_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62
	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	be	ctr192_loop_end
	add	%o2, 16, %o2

ctr192_loop:
	xor	%g1, %g3, %g5
	movxtod	%g5, %f0
	xor	%g2, %g4, %g5
	movxtod	%g5, %f2
	inc	%g4

	xor	%g1, %g3, %g5
	movxtod	%g5, %f60
	xor	%g2, %g4, %g5
	movxtod	%g5, %f62
	inc	%g4

	TWELVE_EROUNDS_2

	ldd	[%o1], %f6		!input
	ldd	[%o1 + 8], %f4		!input
	ldd	[%o1 + 16], %f56	!input
	ldd	[%o1 + 24], %f58	!input
	fxor	%f0, %f6, %f0
	fxor	%f2, %f4, %f2
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62
	std	%f0, [%o2]
	std	%f2, [%o2 + 8]
	std	%f60, [%o2 + 16]
	std	%f62, [%o2 + 24]

	add	%o1, 32, %o1
	subcc	%o3, 32, %o3
	bne	ctr192_loop
	add	%o2, 32, %o2

ctr192_loop_end:
	stx	%g3, [%o4]
	retl
	stx	%g4, [%o4 + 8]

	SET_SIZE(t4_aes192_ctr_crypt)


	ENTRY(t4_aes256_ctr_crypt)

	ldx	[%o4], %g3	! IV
	ldx	[%o4 + 8], %g4	! IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]
	and	%o3, 16, %g5
	brz	%g5, ctr256_loop
	nop

	xor	%g1, %g3, %g5
	movxtod	%g5, %f60
	xor	%g2, %g4, %g5
	movxtod	%g5, %f62
	inc	%g4

	FOURTEEN_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62
	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	be	ctr256_loop_end
	add	%o2, 16, %o2

ctr256_loop:
	xor	%g1, %g3, %g5
	movxtod	%g5, %f20
	xor	%g2, %g4, %g5
	movxtod	%g5, %f22
	inc	%g4

	xor	%g1, %g3, %g5
	movxtod	%g5, %f60
	xor	%g2, %g4, %g5
	movxtod	%g5, %f62
	inc	%g4

	FOURTEEN_EROUNDS_2

	ldd	[%o1], %f56		!input
	ldd	[%o1 + 8], %f58		!input
	fxor	%f20, %f56, %f20
	fxor	%f22, %f58, %f22
	ldd	[%o1 + 16], %f56	!input
	ldd	[%o1 + 24], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62
	std	%f20, [%o2]
	std	%f22, [%o2 + 8]
	std	%f60, [%o2 + 16]
	std	%f62, [%o2 + 24]

	add	%o1, 32, %o1
	subcc	%o3, 32, %o3
	bne	ctr256_loop
	add	%o2, 32, %o2

	ldd	[%o0 + 0x60], %f20
	ldd	[%o0 + 0x68], %f22

ctr256_loop_end:
	stx	%g3, [%o4]
	retl
	stx	%g4, [%o4 + 8]

	SET_SIZE(t4_aes256_ctr_crypt)

#else

	ENTRY(t4_aes128_ctr_crypt)

	ldx	[%o4], %g3	! IV
	ldx	[%o4 + 8], %g4	! IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

ctr128_loop:
	xor	%g1, %g3, %g5
	movxtod	%g5, %f60
	xor	%g2, %g4, %g5
	movxtod	%g5, %f62
	inc	%g4

	TEN_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62
	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	ctr128_loop
	add	%o2, 16, %o2

	stx	%g3, [%o4]
	retl
	stx	%g4, [%o4 + 8]

	SET_SIZE(t4_aes128_ctr_crypt)

	ENTRY(t4_aes192_ctr_crypt)

	ldx	[%o4], %g3	! IV
	ldx	[%o4 + 8], %g4	! IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

ctr192_loop:
	xor	%g1, %g3, %g5
	movxtod	%g5, %f60
	xor	%g2, %g4, %g5
	movxtod	%g5, %f62
	inc	%g4

	TWELVE_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62
	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	ctr192_loop
	add	%o2, 16, %o2

	stx	%g3, [%o4]
	retl
	stx	%g4, [%o4 + 8]

	SET_SIZE(t4_aes192_ctr_crypt)


	ENTRY(t4_aes256_ctr_crypt)

	ldx	[%o4], %g3	! IV
	ldx	[%o4 + 8], %g4	! IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

ctr256_loop:
	xor	%g1, %g3, %g5
	movxtod	%g5, %f60
	xor	%g2, %g4, %g5
	movxtod	%g5, %f62
	inc	%g4

	FOURTEEN_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62
	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	ctr256_loop
	add	%o2, 16, %o2

	stx	%g3, [%o4]
	retl
	stx	%g4, [%o4 + 8]

	SET_SIZE(t4_aes256_ctr_crypt)

#endif
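
! CFB128 encryption computes C[i] = E(IV) ^ P[i], and each ciphertext
! block becomes the next IV, so the loop is inherently serial.  The
! first round key is xored into the IV with fxor (the IV is already
! in FP registers) rather than with the integer xors used elsewhere.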

	ENTRY(t4_aes128_cfb128_encrypt)

	ldd	[%o4], %f60	! IV
	ldd	[%o4 + 8], %f62	! IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

cfb128_128_loop:
	movxtod	%g1, %f56
	movxtod	%g2, %f58
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	TEN_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	cfb128_128_loop
	add	%o2, 16, %o2

	std	%f60, [%o4]
	retl
	std	%f62, [%o4 + 8]

	SET_SIZE(t4_aes128_cfb128_encrypt)


	ENTRY(t4_aes192_cfb128_encrypt)

	ldd	[%o4], %f60	! IV
	ldd	[%o4 + 8], %f62	! IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

cfb128_192_loop:
	movxtod	%g1, %f56
	movxtod	%g2, %f58
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	TWELVE_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	cfb128_192_loop
	add	%o2, 16, %o2

	std	%f60, [%o4]
	retl
	std	%f62, [%o4 + 8]

	SET_SIZE(t4_aes192_cfb128_encrypt)


	ENTRY(t4_aes256_cfb128_encrypt)

	ldd	[%o4], %f60	! IV
	ldd	[%o4 + 8], %f62	! IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

cfb128_256_loop:
	movxtod	%g1, %f56
	movxtod	%g2, %f58
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	FOURTEEN_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	cfb128_256_loop
	add	%o2, 16, %o2

	std	%f60, [%o4]
	retl
	std	%f62, [%o4 + 8]

	SET_SIZE(t4_aes256_cfb128_encrypt)
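
! Decrypt key loading walks the schedule backwards: the first round
! key lands in %f52/%f54 and successively later round keys in lower
! register pairs, so the register order used by the dround macros
! traverses the schedule in reverse.  The final round key
! (ks[last-1]/ks[last]) is applied as the initial ARK with integer
! xors in the decrypt routines.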


	ENTRY(t4_aes128_load_keys_for_decrypt)

	ldd	[%o0], %f52
	ldd	[%o0 + 0x8], %f54
	ldd	[%o0 + 0x10], %f48
	ldd	[%o0 + 0x18], %f50
	ldd	[%o0 + 0x20], %f44
	ldd	[%o0 + 0x28], %f46
	ldd	[%o0 + 0x30], %f40
	ldd	[%o0 + 0x38], %f42
	ldd	[%o0 + 0x40], %f36
	ldd	[%o0 + 0x48], %f38
	ldd	[%o0 + 0x50], %f32
	ldd	[%o0 + 0x58], %f34
	ldd	[%o0 + 0x60], %f28
	ldd	[%o0 + 0x68], %f30
	ldd	[%o0 + 0x70], %f24
	ldd	[%o0 + 0x78], %f26
	ldd	[%o0 + 0x80], %f20
	ldd	[%o0 + 0x88], %f22
	ldd	[%o0 + 0x90], %f16
	retl
	ldd	[%o0 + 0x98], %f18

	SET_SIZE(t4_aes128_load_keys_for_decrypt)


	ENTRY(t4_aes192_load_keys_for_decrypt)

	ldd	[%o0], %f52
	ldd	[%o0 + 0x8], %f54
	ldd	[%o0 + 0x10], %f48
	ldd	[%o0 + 0x18], %f50
	ldd	[%o0 + 0x20], %f44
	ldd	[%o0 + 0x28], %f46
	ldd	[%o0 + 0x30], %f40
	ldd	[%o0 + 0x38], %f42
	ldd	[%o0 + 0x40], %f36
	ldd	[%o0 + 0x48], %f38
	ldd	[%o0 + 0x50], %f32
	ldd	[%o0 + 0x58], %f34
	ldd	[%o0 + 0x60], %f28
	ldd	[%o0 + 0x68], %f30
	ldd	[%o0 + 0x70], %f24
	ldd	[%o0 + 0x78], %f26
	ldd	[%o0 + 0x80], %f20
	ldd	[%o0 + 0x88], %f22
	ldd	[%o0 + 0x90], %f16
	ldd	[%o0 + 0x98], %f18
	ldd	[%o0 + 0xa0], %f12
	ldd	[%o0 + 0xa8], %f14
	ldd	[%o0 + 0xb0], %f8
	retl
	ldd	[%o0 + 0xb8], %f10

	SET_SIZE(t4_aes192_load_keys_for_decrypt)


	ENTRY(t4_aes256_load_keys_for_decrypt)

	ldd	[%o0], %f52
	ldd	[%o0 + 0x8], %f54
	ldd	[%o0 + 0x10], %f48
	ldd	[%o0 + 0x18], %f50
	ldd	[%o0 + 0x20], %f44
	ldd	[%o0 + 0x28], %f46
	ldd	[%o0 + 0x30], %f40
	ldd	[%o0 + 0x38], %f42
	ldd	[%o0 + 0x40], %f36
	ldd	[%o0 + 0x48], %f38
	ldd	[%o0 + 0x50], %f32
	ldd	[%o0 + 0x58], %f34
	ldd	[%o0 + 0x60], %f28
	ldd	[%o0 + 0x68], %f30
	ldd	[%o0 + 0x70], %f24
	ldd	[%o0 + 0x78], %f26
	ldd	[%o0 + 0x80], %f20
	ldd	[%o0 + 0x88], %f22
	ldd	[%o0 + 0x90], %f16
	ldd	[%o0 + 0x98], %f18
	ldd	[%o0 + 0xa0], %f12
	ldd	[%o0 + 0xa8], %f14
	ldd	[%o0 + 0xb0], %f8
	ldd	[%o0 + 0xb8], %f10
	ldd	[%o0 + 0xc0], %f4
	ldd	[%o0 + 0xc8], %f6
	ldd	[%o0 + 0xd0], %f0
	retl
	ldd	[%o0 + 0xd8], %f2

	SET_SIZE(t4_aes256_load_keys_for_decrypt)


#define	TEST_PARALLEL_ECB_DECRYPT
#ifdef	TEST_PARALLEL_ECB_DECRYPT
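! Parallel ECB decrypt mirrors the encrypt side: a 16-byte remainder
! is handled first, then the loop decrypts 32 bytes per iteration
! with the two-block *_DROUNDS_2 macros.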
	ENTRY(t4_aes128_ecb_decrypt)

	ldx	[%o0 + 0xa0], %g1	!ks[last-1]
	ldx	[%o0 + 0xa8], %g2	!ks[last]
	and	%o3, 16, %o4
	brz	%o4, ecbdec128_loop
	nop

	ldx	[%o1], %o4
	ldx	[%o1 + 8], %o5
	xor	%g1, %o4, %g3	!initial ARK
	movxtod	%g3, %f60
	xor	%g2, %o5, %g3	!initial ARK
	movxtod	%g3, %f62

	TEN_DROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 0x8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	be	ecbdec128_loop_end
	add	%o2, 16, %o2

ecbdec128_loop:
	ldx	[%o1], %o4
	ldx	[%o1 + 8], %o5
	xor	%g1, %o4, %g3	!initial ARK
	movxtod	%g3, %f0
	xor	%g2, %o5, %g3	!initial ARK
	movxtod	%g3, %f2
	ldx	[%o1 + 16], %o4
	ldx	[%o1 + 24], %o5
	xor	%g1, %o4, %g3	!initial ARK
	movxtod	%g3, %f60
	xor	%g2, %o5, %g3	!initial ARK
	movxtod	%g3, %f62

	TEN_DROUNDS_2

	std	%f0, [%o2]
	std	%f2, [%o2 + 8]
	std	%f60, [%o2 + 16]
	std	%f62, [%o2 + 24]

	add	%o1, 32, %o1
	subcc	%o3, 32, %o3
	bne	ecbdec128_loop
	add	%o2, 32, %o2
ecbdec128_loop_end:

	retl
	nop

	SET_SIZE(t4_aes128_ecb_decrypt)

	ENTRY(t4_aes192_ecb_decrypt)

	ldx	[%o0 + 0xc0], %g1	!ks[last-1]
	ldx	[%o0 + 0xc8], %g2	!ks[last]
	and	%o3, 16, %o4
	brz	%o4, ecbdec192_loop
	nop

	ldx	[%o1], %o4
	ldx	[%o1 + 8], %o5
	xor	%g1, %o4, %g3	!initial ARK
	movxtod	%g3, %f60
	xor	%g2, %o5, %g3	!initial ARK
	movxtod	%g3, %f62

	TWELVE_DROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 0x8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	be	ecbdec192_loop_end
	add	%o2, 16, %o2

ecbdec192_loop:
	ldx	[%o1], %o4
	ldx	[%o1 + 8], %o5
	xor	%g1, %o4, %g3	!initial ARK
	movxtod	%g3, %f0
	xor	%g2, %o5, %g3	!initial ARK
	movxtod	%g3, %f2
	ldx	[%o1 + 16], %o4
	ldx	[%o1 + 24], %o5
	xor	%g1, %o4, %g3	!initial ARK
	movxtod	%g3, %f60
	xor	%g2, %o5, %g3	!initial ARK
	movxtod	%g3, %f62

	TWELVE_DROUNDS_2

	std	%f0, [%o2]
	std	%f2, [%o2 + 8]
	std	%f60, [%o2 + 16]
	std	%f62, [%o2 + 24]

	add	%o1, 32, %o1
	subcc	%o3, 32, %o3
	bne	ecbdec192_loop
	add	%o2, 32, %o2
ecbdec192_loop_end:

	retl
	nop

	SET_SIZE(t4_aes192_ecb_decrypt)


	ENTRY(t4_aes256_ecb_decrypt)

	ldx	[%o0 + 0xe0], %g1	!ks[last-1]
	ldx	[%o0 + 0xe8], %g2	!ks[last]
	and	%o3, 16, %o4
	brz	%o4, ecbdec256_loop
	nop

	ldx	[%o1], %o4
	ldx	[%o1 + 8], %o5
	xor	%g1, %o4, %g3	!initial ARK
	movxtod	%g3, %f60
	xor	%g2, %o5, %g3	!initial ARK
	movxtod	%g3, %f62

	FOURTEEN_DROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 0x8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	be	ecbdec256_loop_end
	add	%o2, 16, %o2

ecbdec256_loop:
	ldx	[%o1], %o4
	ldx	[%o1 + 8], %o5
	xor	%g1, %o4, %g3	!initial ARK
	movxtod	%g3, %f20
	xor	%g2, %o5, %g3	!initial ARK
	movxtod	%g3, %f22
	ldx	[%o1 + 16], %o4
	ldx	[%o1 + 24], %o5
	xor	%g1, %o4, %g3	!initial ARK
	movxtod	%g3, %f60
	xor	%g2, %o5, %g3	!initial ARK
	movxtod	%g3, %f62

	FOURTEEN_DROUNDS_2

	std	%f20, [%o2]
	std	%f22, [%o2 + 8]
	std	%f60, [%o2 + 16]
	std	%f62, [%o2 + 24]

	add	%o1, 32, %o1
	subcc	%o3, 32, %o3
	bne	ecbdec256_loop
	add	%o2, 32, %o2

	ldd	[%o0 + 0x80], %f20
	ldd	[%o0 + 0x88], %f22

ecbdec256_loop_end:

	retl
	nop

	SET_SIZE(t4_aes256_ecb_decrypt)

#else

	ENTRY(t4_aes128_ecb_decrypt)

	ldx	[%o0 + 0xa0], %g1	!ks[last-1]
	ldx	[%o0 + 0xa8], %g2	!ks[last]

ecbdec128_loop:
	ldx	[%o1], %o4
	ldx	[%o1 + 8], %o5
	xor	%g1, %o4, %g3	!initial ARK
	movxtod	%g3, %f60
	xor	%g2, %o5, %g3	!initial ARK
	movxtod	%g3, %f62

	TEN_DROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 0x8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	ecbdec128_loop
	add	%o2, 16, %o2

	retl
	nop

	SET_SIZE(t4_aes128_ecb_decrypt)


	ENTRY(t4_aes192_ecb_decrypt)

	ldx	[%o0 + 0xc0], %g1	!ks[last-1]
	ldx	[%o0 + 0xc8], %g2	!ks[last]

ecbdec192_loop:
	ldx	[%o1], %o4
	ldx	[%o1 + 8], %o5
	xor	%g1, %o4, %g3	!initial ARK
	movxtod	%g3, %f60
	xor	%g2, %o5, %g3	!initial ARK
	movxtod	%g3, %f62

	TWELVE_DROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 0x8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	ecbdec192_loop
	add	%o2, 16, %o2

	retl
	nop

	SET_SIZE(t4_aes192_ecb_decrypt)


	ENTRY(t4_aes256_ecb_decrypt)

	ldx	[%o0 + 0xe0], %g1	!ks[last-1]
	ldx	[%o0 + 0xe8], %g2	!ks[last]

ecbdec256_loop:
	ldx	[%o1], %o4
	ldx	[%o1 + 8], %o5
	xor	%g1, %o4, %g3	!initial ARK
	movxtod	%g3, %f60
	xor	%g2, %o5, %g3	!initial ARK
	movxtod	%g3, %f62

	FOURTEEN_DROUNDS

	std	%f60, [%o2]
	std	%f62, [%o2 + 0x8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	ecbdec256_loop
	add	%o2, 16, %o2

	retl
	nop

	SET_SIZE(t4_aes256_ecb_decrypt)

#endif
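
! Unlike CBC encryption, CBC decryption parallelizes: the chaining
! xor uses the previous ciphertext block, which is already available,
! so two blocks can be decrypted together and their IVs xored in
! afterwards.  These routines take a register window (save/restore)
! for the extra scratch registers they need.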

#define	TEST_PARALLEL_CBC_DECRYPT
#ifdef	TEST_PARALLEL_CBC_DECRYPT
	ENTRY(t4_aes128_cbc_decrypt)

	save	%sp, -SA(MINFRAME), %sp
	ldx	[%i4], %o0		!IV
	ldx	[%i4 + 8], %o1		!IV
	ldx	[%i0 + 0xa0], %o2	!ks[last-1]
	ldx	[%i0 + 0xa8], %o3	!ks[last]
	and	%i3, 16, %o4
	brz	%o4, cbcdec128_loop
	nop

	ldx	[%i1], %o4
	ldx	[%i1 + 8], %o5
	xor	%o2, %o4, %g1	!initial ARK
	movxtod	%g1, %f60
	xor	%o3, %o5, %g1	!initial ARK
	movxtod	%g1, %f62

	TEN_DROUNDS

	movxtod	%o0, %f56
	movxtod	%o1, %f58
	mov	%o4, %o0	!save last block as next IV
	mov	%o5, %o1
	fxor	%f56, %f60, %f60	!add in previous IV
	fxor	%f58, %f62, %f62

	std	%f60, [%i2]
	std	%f62, [%i2 + 0x8]

	add	%i1, 16, %i1
	subcc	%i3, 16, %i3
	be	cbcdec128_loop_end
	add	%i2, 16, %i2


cbcdec128_loop:
	ldx	[%i1], %g4
	ldx	[%i1 + 8], %g5
	xor	%o2, %g4, %g1	!initial ARK
	movxtod	%g1, %f0
	xor	%o3, %g5, %g1	!initial ARK
	movxtod	%g1, %f2

	ldx	[%i1 + 16], %o4
	ldx	[%i1 + 24], %o5
	xor	%o2, %o4, %g1	!initial ARK
	movxtod	%g1, %f60
	xor	%o3, %o5, %g1	!initial ARK
	movxtod	%g1, %f62

	TEN_DROUNDS_2

	movxtod	%o0, %f6
	movxtod	%o1, %f4
	fxor	%f6, %f0, %f0	!add in previous IV
	fxor	%f4, %f2, %f2

	std	%f0, [%i2]
	std	%f2, [%i2 + 8]

	movxtod	%g4, %f56
	movxtod	%g5, %f58
	mov	%o4, %o0	!save last block as next IV
	mov	%o5, %o1
	fxor	%f56, %f60, %f60	!add in previous IV
	fxor	%f58, %f62, %f62

	std	%f60, [%i2 + 16]
	std	%f62, [%i2 + 24]

	add	%i1, 32, %i1
	subcc	%i3, 32, %i3
	bne	cbcdec128_loop
	add	%i2, 32, %i2

cbcdec128_loop_end:
	stx	%o0, [%i4]
	stx	%o1, [%i4 + 8]
	ret
	restore

	SET_SIZE(t4_aes128_cbc_decrypt)
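
/*
 * CBC decryption, unlike CBC encryption, has no serial dependence
 * between blocks: each plaintext is D(C[i]) xor C[i-1], and both
 * inputs are ciphertext already in hand, which is why the loop above
 * can pipeline two blocks per pass with TEN_DROUNDS_2.  A minimal C
 * sketch of the chaining, where decrypt_block() is a hypothetical
 * stand-in for the initial ARK plus the DROUNDS macros:
 */
#if 0	/* illustration only, never compiled */
#include <stdint.h>
#include <stddef.h>

extern void decrypt_block(const uint64_t *ks, const uint64_t c[2],
    uint64_t p[2]);			/* hypothetical one-block decrypt */

void
t4_aes_cbc_decrypt_model(const uint64_t *ks, const uint64_t *in,
    uint64_t *out, size_t len, uint64_t *iv)
{
	uint64_t p[2];
	size_t i;

	for (i = 0; i < len; i += 16, in += 2, out += 2) {
		decrypt_block(ks, in, p);	/* independent per block */
		out[0] = p[0] ^ iv[0];		/* add in previous IV */
		out[1] = p[1] ^ iv[1];
		iv[0] = in[0];			/* save block as next IV */
		iv[1] = in[1];
	}
}
#endif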


	ENTRY(t4_aes192_cbc_decrypt)

	save	%sp, -SA(MINFRAME), %sp
	ldx	[%i4], %o0		!IV
	ldx	[%i4 + 8], %o1		!IV
	ldx	[%i0 + 0xc0], %o2	!ks[last-1]
	ldx	[%i0 + 0xc8], %o3	!ks[last]
	and	%i3, 16, %o4
	brz	%o4, cbcdec192_loop
	nop

	ldx	[%i1], %o4
	ldx	[%i1 + 8], %o5
	xor	%o2, %o4, %g1	!initial ARK
	movxtod	%g1, %f60
	xor	%o3, %o5, %g1	!initial ARK
	movxtod	%g1, %f62

	TWELVE_DROUNDS

	movxtod	%o0, %f56
	movxtod	%o1, %f58
	mov	%o4, %o0	!save last block as next IV
	mov	%o5, %o1
	fxor	%f56, %f60, %f60	!add in previous IV
	fxor	%f58, %f62, %f62

	std	%f60, [%i2]
	std	%f62, [%i2 + 0x8]

	add	%i1, 16, %i1
	subcc	%i3, 16, %i3
	be	cbcdec192_loop_end
	add	%i2, 16, %i2


cbcdec192_loop:
	ldx	[%i1], %g4
	ldx	[%i1 + 8], %g5
	xor	%o2, %g4, %g1	!initial ARK
	movxtod	%g1, %f0
	xor	%o3, %g5, %g1	!initial ARK
	movxtod	%g1, %f2

	ldx	[%i1 + 16], %o4
	ldx	[%i1 + 24], %o5
	xor	%o2, %o4, %g1	!initial ARK
	movxtod	%g1, %f60
	xor	%o3, %o5, %g1	!initial ARK
	movxtod	%g1, %f62

	TWELVE_DROUNDS_2

	movxtod	%o0, %f6
	movxtod	%o1, %f4
	fxor	%f6, %f0, %f0	!add in previous IV
	fxor	%f4, %f2, %f2

	std	%f0, [%i2]
	std	%f2, [%i2 + 8]

	movxtod	%g4, %f56
	movxtod	%g5, %f58
	mov	%o4, %o0	!save last block as next IV
	mov	%o5, %o1
	fxor	%f56, %f60, %f60	!add in previous IV
	fxor	%f58, %f62, %f62

	std	%f60, [%i2 + 16]
	std	%f62, [%i2 + 24]

	add	%i1, 32, %i1
	subcc	%i3, 32, %i3
	bne	cbcdec192_loop
	add	%i2, 32, %i2

cbcdec192_loop_end:
	stx	%o0, [%i4]
	stx	%o1, [%i4 + 8]
	ret
	restore

	SET_SIZE(t4_aes192_cbc_decrypt)


	ENTRY(t4_aes256_cbc_decrypt)

	save	%sp, -SA(MINFRAME), %sp
	mov	%i0, %o0		!FOURTEEN_DROUNDS uses %o0
	ldx	[%i4], %g2		!IV
	ldx	[%i4 + 8], %o1		!IV
	ldx	[%o0 + 0xe0], %o2	!ks[last-1]
	ldx	[%o0 + 0xe8], %o3	!ks[last]
	and	%i3, 16, %o4
	brz	%o4, cbcdec256_loop
	nop

	ldx	[%i1], %o4
	ldx	[%i1 + 8], %o5
	xor	%o2, %o4, %g1	!initial ARK
	movxtod	%g1, %f60
	xor	%o3, %o5, %g1	!initial ARK
	movxtod	%g1, %f62

	FOURTEEN_DROUNDS

	movxtod	%g2, %f56
	movxtod	%o1, %f58
	mov	%o4, %g2	!save last block as next IV
	mov	%o5, %o1
	fxor	%f56, %f60, %f60	!add in previous IV
	fxor	%f58, %f62, %f62

	std	%f60, [%i2]
	std	%f62, [%i2 + 0x8]

	add	%i1, 16, %i1
	subcc	%i3, 16, %i3
	be	cbcdec256_loop_end
	add	%i2, 16, %i2


cbcdec256_loop:
	ldx	[%i1], %g4
	ldx	[%i1 + 8], %g5
	xor	%o2, %g4, %g1	!initial ARK
	movxtod	%g1, %f20
	xor	%o3, %g5, %g1	!initial ARK
	movxtod	%g1, %f22

	ldx	[%i1 + 16], %o4
	ldx	[%i1 + 24], %o5
	xor	%o2, %o4, %g1	!initial ARK
	movxtod	%g1, %f60
	xor	%o3, %o5, %g1	!initial ARK
	movxtod	%g1, %f62

	FOURTEEN_DROUNDS_2

	movxtod	%g2, %f56
	movxtod	%o1, %f58
	fxor	%f56, %f20, %f20	!add in previous IV
	fxor	%f58, %f22, %f22

	std	%f20, [%i2]
	std	%f22, [%i2 + 8]

	movxtod	%g4, %f56
	movxtod	%g5, %f58
	mov	%o4, %g2	!save last block as next IV
	mov	%o5, %o1
	fxor	%f56, %f60, %f60	!add in previous IV
	fxor	%f58, %f62, %f62

	std	%f60, [%i2 + 16]
	std	%f62, [%i2 + 24]

	add	%i1, 32, %i1
	subcc	%i3, 32, %i3
	bne	cbcdec256_loop
	add	%i2, 32, %i2

	ldd	[%o0 + 0x80], %f20	!reload ks[16], ks[17]; the loop
	ldd	[%o0 + 0x88], %f22	!used %f20/%f22 as block state

cbcdec256_loop_end:
	stx	%g2, [%i4]
	stx	%o1, [%i4 + 8]
	ret
	restore

	SET_SIZE(t4_aes256_cbc_decrypt)

#else

	ENTRY(t4_aes128_cbc_decrypt)

	save	%sp, -SA(MINFRAME), %sp
	ldx	[%i4], %o0		!IV
	ldx	[%i4 + 8], %o1		!IV
	ldx	[%i0 + 0xa0], %o2	!ks[last-1]
	ldx	[%i0 + 0xa8], %o3	!ks[last]

cbcdec128_loop:
	ldx	[%i1], %o4
	ldx	[%i1 + 8], %o5
	xor	%o2, %o4, %g1	!initial ARK
	movxtod	%g1, %f60
	xor	%o3, %o5, %g1	!initial ARK
	movxtod	%g1, %f62

	TEN_DROUNDS

	movxtod	%o0, %f56
	movxtod	%o1, %f58
	mov	%o4, %o0	!save last block as next IV
	mov	%o5, %o1
	fxor	%f56, %f60, %f60	!add in previous IV
	fxor	%f58, %f62, %f62

	std	%f60, [%i2]
	std	%f62, [%i2 + 0x8]

	add	%i1, 16, %i1
	subcc	%i3, 16, %i3
	bne	cbcdec128_loop
	add	%i2, 16, %i2

	stx	%o0, [%i4]
	stx	%o1, [%i4 + 8]
	ret
	restore

	SET_SIZE(t4_aes128_cbc_decrypt)


	ENTRY(t4_aes192_cbc_decrypt)

	save	%sp, -SA(MINFRAME), %sp
	ldx	[%i4], %o0		!IV
	ldx	[%i4 + 8], %o1		!IV
	ldx	[%i0 + 0xc0], %o2	!ks[last-1]
	ldx	[%i0 + 0xc8], %o3	!ks[last]

cbcdec192_loop:
	ldx	[%i1], %o4
	ldx	[%i1 + 8], %o5
	xor	%o2, %o4, %g1	!initial ARK
	movxtod	%g1, %f60
	xor	%o3, %o5, %g1	!initial ARK
	movxtod	%g1, %f62

	TWELVE_DROUNDS

	movxtod	%o0, %f56
	movxtod	%o1, %f58
	mov	%o4, %o0	!save last block as next IV
	mov	%o5, %o1
	fxor	%f56, %f60, %f60	!add in previous IV
	fxor	%f58, %f62, %f62

	std	%f60, [%i2]
	std	%f62, [%i2 + 0x8]

	add	%i1, 16, %i1
	subcc	%i3, 16, %i3
	bne	cbcdec192_loop
	add	%i2, 16, %i2

	stx	%o0, [%i4]
	stx	%o1, [%i4 + 8]
	ret
	restore

	SET_SIZE(t4_aes192_cbc_decrypt)


	ENTRY(t4_aes256_cbc_decrypt)

	save	%sp, -SA(MINFRAME), %sp
	ldx	[%i4], %o0		!IV
	ldx	[%i4 + 8], %o1		!IV
	ldx	[%i0 + 0xe0], %o2	!ks[last-1]
	ldx	[%i0 + 0xe8], %o3	!ks[last]

cbcdec256_loop:
	ldx	[%i1], %o4
	ldx	[%i1 + 8], %o5
	xor	%o2, %o4, %g1	!initial ARK
	movxtod	%g1, %f60
	xor	%o3, %o5, %g1	!initial ARK
	movxtod	%g1, %f62

	FOURTEEN_DROUNDS

	movxtod	%o0, %f56
	movxtod	%o1, %f58
	mov	%o4, %o0	!save last block as next IV
	mov	%o5, %o1
	fxor	%f56, %f60, %f60	!add in previous IV
	fxor	%f58, %f62, %f62

	std	%f60, [%i2]
	std	%f62, [%i2 + 0x8]

	add	%i1, 16, %i1
	subcc	%i3, 16, %i3
	bne	cbcdec256_loop
	add	%i2, 16, %i2

	stx	%o0, [%i4]
	stx	%o1, [%i4 + 8]
	ret
	restore

	SET_SIZE(t4_aes256_cbc_decrypt)

#endif

#define	TEST_PARALLEL_CFB128_DECRYPT
#ifdef	TEST_PARALLEL_CFB128_DECRYPT
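
/*
 * CFB-128 decryption runs the cipher in the forward direction over the
 * previous ciphertext block (hence the EROUNDS macros below), and all
 * cipher inputs are ciphertext, so two blocks can be pipelined per
 * pass just as in CBC decryption.  A minimal C sketch, where
 * encrypt_block() is a hypothetical stand-in for the ks[0]/ks[1]
 * whitening plus the EROUNDS macros:
 */
#if 0	/* illustration only, never compiled */
#include <stdint.h>
#include <stddef.h>

extern void encrypt_block(const uint64_t *ks, const uint64_t x[2],
    uint64_t y[2]);			/* hypothetical one-block encrypt */

void
t4_aes_cfb128_decrypt_model(const uint64_t *ks, const uint64_t *in,
    uint64_t *out, size_t len, uint64_t *iv)
{
	uint64_t ek[2];
	size_t i;

	for (i = 0; i < len; i += 16, in += 2, out += 2) {
		encrypt_block(ks, iv, ek);	/* forward cipher on prev C */
		out[0] = ek[0] ^ in[0];
		out[1] = ek[1] ^ in[1];
		iv[0] = in[0];			/* ciphertext feeds forward */
		iv[1] = in[1];
	}
}
#endif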

	ENTRY(t4_aes128_cfb128_decrypt)

	ldd	[%o4], %f56	!IV
	ldd	[%o4 + 8], %f58	!IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]
	and	%o3, 16, %o5	!odd 16-byte block to peel off?
	brz	%o5, cfb128dec_128_loop
	movxtod	%g1, %f60	!delay slot; harmless if branch taken
	movxtod	%g2, %f62
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	/* CFB mode uses encryption for the decrypt operation */
	TEN_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	be	cfb128dec_128_loop_end
	add	%o2, 16, %o2

cfb128dec_128_loop:
	ldd	[%o1], %f6	!input
	ldd	[%o1 + 8], %f4	!input
	movxtod	%g1, %f60
	movxtod	%g2, %f62
	fxor	%f60, %f6, %f0
	fxor	%f62, %f4, %f2
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	/* CFB mode uses encryption for the decrypt operation */
	TEN_EROUNDS_2

	ldd	[%o1], %f6	!input
	ldd	[%o1 + 8], %f4	!input
	ldd	[%o1 + 16], %f56	!input
	ldd	[%o1 + 24], %f58	!input

	fxor	%f60, %f6, %f6
	fxor	%f62, %f4, %f4
	fxor	%f0, %f56, %f60
	fxor	%f2, %f58, %f62

	std	%f6, [%o2]
	std	%f4, [%o2 + 8]
	std	%f60, [%o2 + 16]
	std	%f62, [%o2 + 24]

	add	%o1, 32, %o1
	subcc	%o3, 32, %o3
	bne	cfb128dec_128_loop
	add	%o2, 32, %o2

cfb128dec_128_loop_end:
	std	%f56, [%o4]
	retl
	std	%f58, [%o4 + 8]

	SET_SIZE(t4_aes128_cfb128_decrypt)


	ENTRY(t4_aes192_cfb128_decrypt)

	ldd	[%o4], %f56	!IV
	ldd	[%o4 + 8], %f58	!IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]
	and	%o3, 16, %o5	!odd 16-byte block to peel off?
	brz	%o5, cfb128dec_192_loop
	movxtod	%g1, %f60	!delay slot; harmless if branch taken
	movxtod	%g2, %f62
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	/* CFB mode uses encryption for the decrypt operation */
	TWELVE_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	be	cfb128dec_192_loop_end
	add	%o2, 16, %o2

cfb128dec_192_loop:
	ldd	[%o1], %f6	!input
	ldd	[%o1 + 8], %f4	!input
	movxtod	%g1, %f60
	movxtod	%g2, %f62
	fxor	%f60, %f6, %f0
	fxor	%f62, %f4, %f2
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	/* CFB mode uses encryption for the decrypt operation */
	TWELVE_EROUNDS_2

	ldd	[%o1], %f6	!input
	ldd	[%o1 + 8], %f4	!input
	ldd	[%o1 + 16], %f56	!input
	ldd	[%o1 + 24], %f58	!input

	fxor	%f60, %f6, %f6
	fxor	%f62, %f4, %f4
	fxor	%f0, %f56, %f60
	fxor	%f2, %f58, %f62

	std	%f6, [%o2]
	std	%f4, [%o2 + 8]
	std	%f60, [%o2 + 16]
	std	%f62, [%o2 + 24]

	add	%o1, 32, %o1
	subcc	%o3, 32, %o3
	bne	cfb128dec_192_loop
	add	%o2, 32, %o2

cfb128dec_192_loop_end:
	std	%f56, [%o4]
	retl
	std	%f58, [%o4 + 8]

	SET_SIZE(t4_aes192_cfb128_decrypt)


	ENTRY(t4_aes256_cfb128_decrypt)

	ldd	[%o4], %f56	!IV
	ldd	[%o4 + 8], %f58	!IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]
	and	%o3, 16, %o5	!odd 16-byte block to peel off?
	brz	%o5, cfb128dec_256_loop
	movxtod	%g1, %f60	!delay slot; harmless if branch taken
	movxtod	%g2, %f62
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	/* CFB mode uses encryption for the decrypt operation */
	FOURTEEN_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	be	cfb128dec_256_loop_end
	add	%o2, 16, %o2

cfb128dec_256_loop:
	ldd	[%o1], %f20	!input
	ldd	[%o1 + 8], %f22	!input
	movxtod	%g1, %f60
	movxtod	%g2, %f62
	fxor	%f60, %f20, %f20
	fxor	%f62, %f22, %f22
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	/* CFB mode uses encryption for the decrypt operation */
	FOURTEEN_EROUNDS_2

	ldd	[%o1 + 16], %f56	!input
	ldd	[%o1 + 24], %f58	!input
	fxor	%f20, %f56, %f20
	fxor	%f22, %f58, %f22
	std	%f20, [%o2 + 16]
	std	%f22, [%o2 + 24]

	ldd	[%o1], %f20	!input
	ldd	[%o1 + 8], %f22	!input

	fxor	%f60, %f20, %f20
	fxor	%f62, %f22, %f22

	std	%f20, [%o2]
	std	%f22, [%o2 + 8]

	add	%o1, 32, %o1
	subcc	%o3, 32, %o3
	bne	cfb128dec_256_loop
	add	%o2, 32, %o2

	ldd	[%o0 + 0x60], %f20	!reload ks[12], ks[13]; the loop
	ldd	[%o0 + 0x68], %f22	!used %f20/%f22 as block state

cfb128dec_256_loop_end:
	std	%f56, [%o4]
	retl
	std	%f58, [%o4 + 8]

	SET_SIZE(t4_aes256_cfb128_decrypt)

#else
	ENTRY(t4_aes128_cfb128_decrypt)

	ldd	[%o4], %f56	!IV
	ldd	[%o4 + 8], %f58	!IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

cfb128dec_128_loop:
	movxtod	%g1, %f60
	movxtod	%g2, %f62
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	/* CFB mode uses encryption for the decrypt operation */
	TEN_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	cfb128dec_128_loop
	add	%o2, 16, %o2

	std	%f56, [%o4]
	retl
	std	%f58, [%o4 + 8]

	SET_SIZE(t4_aes128_cfb128_decrypt)


	ENTRY(t4_aes192_cfb128_decrypt)

	ldd	[%o4], %f56	!IV
	ldd	[%o4 + 8], %f58	!IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

cfb128dec_192_loop:
	movxtod	%g1, %f60
	movxtod	%g2, %f62
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	/* CFB mode uses encryption for the decrypt operation */
	TWELVE_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	cfb128dec_192_loop
	add	%o2, 16, %o2

	std	%f56, [%o4]
	retl
	std	%f58, [%o4 + 8]

	SET_SIZE(t4_aes192_cfb128_decrypt)


	ENTRY(t4_aes256_cfb128_decrypt)

	ldd	[%o4], %f56	!IV
	ldd	[%o4 + 8], %f58	!IV
	ldx	[%o0], %g1	! ks[0]
	ldx	[%o0 + 8], %g2	! ks[1]

cfb128dec_256_loop:
	movxtod	%g1, %f60
	movxtod	%g2, %f62
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	/* CFB mode uses encryption for the decrypt operation */
	FOURTEEN_EROUNDS

	ldd	[%o1], %f56	!input
	ldd	[%o1 + 8], %f58	!input
	fxor	%f60, %f56, %f60
	fxor	%f62, %f58, %f62

	std	%f60, [%o2]
	std	%f62, [%o2 + 8]

	add	%o1, 16, %o1
	subcc	%o3, 16, %o3
	bne	cfb128dec_256_loop
	add	%o2, 16, %o2

	std	%f56, [%o4]
	retl
	std	%f58, [%o4 + 8]

	SET_SIZE(t4_aes256_cfb128_decrypt)

#endif

#endif	/* lint || __lint */