components/openssl/openssl-1.0.1/engines/t4/t4_aes.S
branch s11-update
changeset 2593 b92e6df5eaf0
parent 603 1b966e9a6b03
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/openssl/openssl-1.0.1/engines/t4/t4_aes.S	Fri May 03 16:10:11 2013 -0700
@@ -0,0 +1,3052 @@
+/*
+ * ====================================================================
+ * Copyright (c) 1998-2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    [email protected].
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*LINTLIBRARY*/
+
+#if defined(lint) || defined(__lint)
+
+
+#include <sys/types.h>
+
+/*ARGSUSED*/
+void t4_aes_expand128(uint64_t *rk, const uint32_t *key)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes_expand192(uint64_t *rk, const uint32_t *key)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes_expand256(uint64_t *rk, const uint32_t *key)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes128_load_keys_for_encrypt(uint64_t *ks)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes192_load_keys_for_encrypt(uint64_t *ks)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes256_load_keys_for_encrypt(uint64_t *ks)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes128_ecb_encrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes192_ecb_encrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes256_ecb_encrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes128_cbc_encrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes192_cbc_encrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes256_cbc_encrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes128_ctr_crypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes192_ctr_crypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes256_ctr_crypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes128_cfb128_encrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes192_cfb128_encrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes256_cfb128_encrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes128_load_keys_for_decrypt(uint64_t *ks)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes192_load_keys_for_decrypt(uint64_t *ks)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes256_load_keys_for_decrypt(uint64_t *ks)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes128_ecb_decrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes192_ecb_decrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes256_ecb_decrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes128_cbc_decrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes192_cbc_decrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes256_cbc_decrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes128_cfb128_decrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes192_cfb128_decrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+/*ARGSUSED*/
+void t4_aes256_cfb128_decrypt(uint64_t *ks, uint64_t *asm_in,
+    uint64_t * asm_out, size_t amount_to_encrypt, uint64_t *iv)
+{ return; }
+
+#else	/* lint || __lint */
+
+#include <sys/asm_linkage.h>
+
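+! Calling conventions (SPARC V9 ABI) used throughout this file:
+!   t4_aes_expandNNN:	%o0 = expanded key out, %o1 = user key in
+!   *_load_keys_for_*:	%o0 = expanded key schedule (ks)
+!   crypt routines:	%o0 = ks, %o1 = in, %o2 = out,
+!			%o3 = length in bytes (assumed a multiple of 16),
+!			%o4 = IV (CBC/CFB) or counter block (CTR)
+! The *_load_keys_for_* routines leave the round keys in the double-precision
+! FP registers, where the crypt routines expect to find them.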
+
+	ENTRY(t4_aes_expand128)
+
+!load key
+	ld	[%o1], %f0
+	ld	[%o1 + 0x4], %f1
+	ld	[%o1 + 0x8], %f2
+	ld	[%o1 + 0xc], %f3
+
+!expand the key
+	!aes_kexpand1 %f0, %f2, 0x0, %f4
+	!aes_kexpand2 %f2, %f4, %f6
+	!aes_kexpand1 %f4, %f6, 0x1, %f8
+	!aes_kexpand2 %f6, %f8, %f10
+	!aes_kexpand1 %f8, %f10, 0x2, %f12
+	!aes_kexpand2 %f10, %f12, %f14
+	!aes_kexpand1 %f12, %f14, 0x3, %f16
+	!aes_kexpand2 %f14, %f16, %f18
+	!aes_kexpand1 %f16, %f18, 0x4, %f20
+	!aes_kexpand2 %f18, %f20, %f22
+	!aes_kexpand1 %f20, %f22, 0x5, %f24
+	!aes_kexpand2 %f22, %f24, %f26
+	!aes_kexpand1 %f24, %f26, 0x6, %f28
+	!aes_kexpand2 %f26, %f28, %f30
+	!aes_kexpand1 %f28, %f30, 0x7, %f32
+	!aes_kexpand2 %f30, %f32, %f34
+	!aes_kexpand1 %f32, %f34, 0x8, %f36
+	!aes_kexpand2 %f34, %f36, %f38
+	!aes_kexpand1 %f36, %f38, 0x9, %f40
+	!aes_kexpand2 %f38, %f40, %f42
+	.byte	0x88, 0xc8, 0x01, 0x02
+	.byte	0x8d, 0xb0, 0xa6, 0x24
+	.byte	0x90, 0xc9, 0x03, 0x06
+	.byte	0x95, 0xb1, 0xa6, 0x28
+	.byte	0x98, 0xca, 0x05, 0x0a
+	.byte	0x9d, 0xb2, 0xa6, 0x2c
+	.byte	0xa0, 0xcb, 0x07, 0x0e
+	.byte	0xa5, 0xb3, 0xa6, 0x30
+	.byte	0xa8, 0xcc, 0x09, 0x12
+	.byte	0xad, 0xb4, 0xa6, 0x34
+	.byte	0xb0, 0xcd, 0x0b, 0x16
+	.byte	0xb5, 0xb5, 0xa6, 0x38
+	.byte	0xb8, 0xce, 0x0d, 0x1a
+	.byte	0xbd, 0xb6, 0xa6, 0x3c
+	.byte	0x82, 0xcf, 0x0f, 0x1e
+	.byte	0x87, 0xb7, 0xa6, 0x21
+	.byte	0x8a, 0xc8, 0x51, 0x03
+	.byte	0x8f, 0xb0, 0xe6, 0x25
+	.byte	0x92, 0xc9, 0x53, 0x07
+	.byte	0x97, 0xb1, 0xe6, 0x29
+
+!copy expanded key back into array
+	std	%f4, [%o0]
+	std	%f6, [%o0 + 0x8]
+	std	%f8, [%o0 + 0x10]
+	std	%f10, [%o0 + 0x18]
+	std	%f12, [%o0 + 0x20]
+	std	%f14, [%o0 + 0x28]
+	std	%f16, [%o0 + 0x30]
+	std	%f18, [%o0 + 0x38]
+	std	%f20, [%o0 + 0x40]
+	std	%f22, [%o0 + 0x48]
+	std	%f24, [%o0 + 0x50]
+	std	%f26, [%o0 + 0x58]
+	std	%f28, [%o0 + 0x60]
+	std	%f30, [%o0 + 0x68]
+	std	%f32, [%o0 + 0x70]
+	std	%f34, [%o0 + 0x78]
+	std	%f36, [%o0 + 0x80]
+	std	%f38, [%o0 + 0x88]
+	std	%f40, [%o0 + 0x90]
+	retl
+	std	%f42, [%o0 + 0x98]
+
+	SET_SIZE(t4_aes_expand128)
+
+
+	ENTRY(t4_aes_expand192)
+
+!load key
+	ld	[%o1], %f0
+	ld	[%o1 + 0x4], %f1
+	ld	[%o1 + 0x8], %f2
+	ld	[%o1 + 0xc], %f3
+	ld	[%o1 + 0x10], %f4
+	ld	[%o1 + 0x14], %f5
+
+!expand the key
+	!aes_kexpand1 %f0, %f4, 0x0, %f6
+	!aes_kexpand2 %f2, %f6, %f8
+	!aes_kexpand2 %f4, %f8, %f10
+
+	!aes_kexpand1 %f6, %f10, 0x1, %f12
+	!aes_kexpand2 %f8, %f12, %f14
+	!aes_kexpand2 %f10, %f14, %f16
+
+	!aes_kexpand1 %f12, %f16, 0x2, %f18
+	!aes_kexpand2 %f14, %f18, %f20
+	!aes_kexpand2 %f16, %f20, %f22
+
+	!aes_kexpand1 %f18, %f22, 0x3, %f24
+	!aes_kexpand2 %f20, %f24, %f26
+	!aes_kexpand2 %f22, %f26, %f28
+
+	!aes_kexpand1 %f24, %f28, 0x4, %f30
+	!aes_kexpand2 %f26, %f30, %f32
+	!aes_kexpand2 %f28, %f32, %f34
+
+	!aes_kexpand1 %f30, %f34, 0x5, %f36
+	!aes_kexpand2 %f32, %f36, %f38
+	!aes_kexpand2 %f34, %f38, %f40
+
+	!aes_kexpand1 %f36, %f40, 0x6, %f42
+	!aes_kexpand2 %f38, %f42, %f44
+	!aes_kexpand2 %f40, %f44, %f46
+
+	!aes_kexpand1 %f42, %f46, 0x7, %f48
+	!aes_kexpand2 %f44, %f48, %f50
+	.byte	0x8c, 0xc8, 0x01, 0x04
+	.byte	0x91, 0xb0, 0xa6, 0x26
+	.byte	0x95, 0xb1, 0x26, 0x28
+	.byte	0x98, 0xc9, 0x83, 0x0a
+	.byte	0x9d, 0xb2, 0x26, 0x2c
+	.byte	0xa1, 0xb2, 0xa6, 0x2e
+	.byte	0xa4, 0xcb, 0x05, 0x10
+	.byte	0xa9, 0xb3, 0xa6, 0x32
+	.byte	0xad, 0xb4, 0x26, 0x34
+	.byte	0xb0, 0xcc, 0x87, 0x16
+	.byte	0xb5, 0xb5, 0x26, 0x38
+	.byte	0xb9, 0xb5, 0xa6, 0x3a
+	.byte	0xbc, 0xce, 0x09, 0x1c
+	.byte	0x83, 0xb6, 0xa6, 0x3e
+	.byte	0x87, 0xb7, 0x26, 0x21
+	.byte	0x8a, 0xcf, 0x8b, 0x03
+	.byte	0x8f, 0xb0, 0x66, 0x25
+	.byte	0x93, 0xb0, 0xe6, 0x27
+	.byte	0x96, 0xc9, 0x4d, 0x09
+	.byte	0x9b, 0xb1, 0xe6, 0x2b
+	.byte	0x9f, 0xb2, 0x66, 0x2d
+	.byte	0xa2, 0xca, 0xcf, 0x0f
+	.byte	0xa7, 0xb3, 0x66, 0x31
+
+!copy expanded key back into array
+	std	%f6, [%o0]
+	std	%f8, [%o0 + 0x8]
+	std	%f10, [%o0 + 0x10]
+	std	%f12, [%o0 + 0x18]
+	std	%f14, [%o0 + 0x20]
+	std	%f16, [%o0 + 0x28]
+	std	%f18, [%o0 + 0x30]
+	std	%f20, [%o0 + 0x38]
+	std	%f22, [%o0 + 0x40]
+	std	%f24, [%o0 + 0x48]
+	std	%f26, [%o0 + 0x50]
+	std	%f28, [%o0 + 0x58]
+	std	%f30, [%o0 + 0x60]
+	std	%f32, [%o0 + 0x68]
+	std	%f34, [%o0 + 0x70]
+	std	%f36, [%o0 + 0x78]
+	std	%f38, [%o0 + 0x80]
+	std	%f40, [%o0 + 0x88]
+	std	%f42, [%o0 + 0x90]
+	std	%f44, [%o0 + 0x98]
+	std	%f46, [%o0 + 0xa0]
+	std	%f48, [%o0 + 0xa8]
+	retl
+	std	%f50, [%o0 + 0xb0]
+
+	SET_SIZE(t4_aes_expand192)
+
+
+	ENTRY(t4_aes_expand256)
+
+!load key
+	ld	[%o1], %f0
+	ld	[%o1 + 0x4], %f1
+	ld	[%o1 + 0x8], %f2
+	ld	[%o1 + 0xc], %f3
+	ld	[%o1 + 0x10], %f4
+	ld	[%o1 + 0x14], %f5
+	ld	[%o1 + 0x18], %f6
+	ld	[%o1 + 0x1c], %f7
+
+!expand the key
+	!aes_kexpand1 %f0, %f6, 0x0, %f8
+	!aes_kexpand2 %f2, %f8, %f10
+	!aes_kexpand0 %f4, %f10, %f12
+	!aes_kexpand2 %f6, %f12, %f14
+
+	!aes_kexpand1 %f8, %f14, 0x1, %f16
+	!aes_kexpand2 %f10, %f16, %f18
+	!aes_kexpand0 %f12, %f18, %f20
+	!aes_kexpand2 %f14, %f20, %f22
+
+	!aes_kexpand1 %f16, %f22, 0x2, %f24
+	!aes_kexpand2 %f18, %f24, %f26
+	!aes_kexpand0 %f20, %f26, %f28
+	!aes_kexpand2 %f22, %f28, %f30
+
+	!aes_kexpand1 %f24, %f30, 0x3, %f32
+	!aes_kexpand2 %f26, %f32, %f34
+	!aes_kexpand0 %f28, %f34, %f36
+	!aes_kexpand2 %f30, %f36, %f38
+
+	!aes_kexpand1 %f32, %f38, 0x4, %f40
+	!aes_kexpand2 %f34, %f40, %f42
+	!aes_kexpand0 %f36, %f42, %f44
+	!aes_kexpand2 %f38, %f44, %f46
+
+	!aes_kexpand1 %f40, %f46, 0x5, %f48
+	!aes_kexpand2 %f42, %f48, %f50
+	!aes_kexpand0 %f44, %f50, %f52
+	!aes_kexpand2 %f46, %f52, %f54
+
+	!aes_kexpand1 %f48, %f54, 0x6, %f56
+	!aes_kexpand2 %f50, %f56, %f58
+	.byte	0x90, 0xc8, 0x01, 0x06
+	.byte	0x95, 0xb0, 0xa6, 0x28
+	.byte	0x99, 0xb1, 0x26, 0x0a
+	.byte	0x9d, 0xb1, 0xa6, 0x2c
+	.byte	0xa0, 0xca, 0x03, 0x0e
+	.byte	0xa5, 0xb2, 0xa6, 0x30
+	.byte	0xa9, 0xb3, 0x26, 0x12
+	.byte	0xad, 0xb3, 0xa6, 0x34
+	.byte	0xb0, 0xcc, 0x05, 0x16
+	.byte	0xb5, 0xb4, 0xa6, 0x38
+	.byte	0xb9, 0xb5, 0x26, 0x1a
+	.byte	0xbd, 0xb5, 0xa6, 0x3c
+	.byte	0x82, 0xce, 0x07, 0x1e
+	.byte	0x87, 0xb6, 0xa6, 0x21
+	.byte	0x8b, 0xb7, 0x26, 0x03
+	.byte	0x8f, 0xb7, 0xa6, 0x25
+	.byte	0x92, 0xc8, 0x49, 0x07
+	.byte	0x97, 0xb0, 0xe6, 0x29
+	.byte	0x9b, 0xb1, 0x66, 0x0b
+	.byte	0x9f, 0xb1, 0xe6, 0x2d
+	.byte	0xa2, 0xca, 0x4b, 0x0f
+	.byte	0xa7, 0xb2, 0xe6, 0x31
+	.byte	0xab, 0xb3, 0x66, 0x13
+	.byte	0xaf, 0xb3, 0xe6, 0x35
+	.byte	0xb2, 0xcc, 0x4d, 0x17
+	.byte	0xb7, 0xb4, 0xe6, 0x39
+
+!copy expanded key back into array
+	std	%f8, [%o0]
+	std	%f10, [%o0 + 0x8]
+	std	%f12, [%o0 + 0x10]
+	std	%f14, [%o0 + 0x18]
+	std	%f16, [%o0 + 0x20]
+	std	%f18, [%o0 + 0x28]
+	std	%f20, [%o0 + 0x30]
+	std	%f22, [%o0 + 0x38]
+	std	%f24, [%o0 + 0x40]
+	std	%f26, [%o0 + 0x48]
+	std	%f28, [%o0 + 0x50]
+	std	%f30, [%o0 + 0x58]
+	std	%f32, [%o0 + 0x60]
+	std	%f34, [%o0 + 0x68]
+	std	%f36, [%o0 + 0x70]
+	std	%f38, [%o0 + 0x78]
+	std	%f40, [%o0 + 0x80]
+	std	%f42, [%o0 + 0x88]
+	std	%f44, [%o0 + 0x90]
+	std	%f46, [%o0 + 0x98]
+	std	%f48, [%o0 + 0xa0]
+	std	%f50, [%o0 + 0xa8]
+	std	%f52, [%o0 + 0xb0]
+	std	%f54, [%o0 + 0xb8]
+	std	%f56, [%o0 + 0xc0]
+	retl
+	std	%f58, [%o0 + 0xc8]
+
+	SET_SIZE(t4_aes_expand256)
+
+
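+! The .byte sequences in the macros below are pre-assembled T4 AES round
+! instructions; the commented-out mnemonics that accompany each group show
+! the intended instructions, for assemblers that do not yet understand the
+! T4 crypto opcodes.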
+#define	FIRST_TWO_EROUNDS \
+	.byte	0xb2, 0xc8, 0x3e, 0x1d ; \
+	.byte	0xb6, 0xc8, 0xbe, 0x3d ; \
+	.byte	0xba, 0xc9, 0x36, 0x19 ; \
+	.byte	0xbe, 0xc9, 0xb6, 0x39
+	!aes_eround01	%f0, %f60, %f62, %f56 ; \
+	!aes_eround23	%f2, %f60, %f62, %f58 ; \
+	!aes_eround01	%f4, %f56, %f58, %f60 ; \
+	!aes_eround23	%f6, %f56, %f58, %f62
+
+#define	MID_TWO_EROUNDS \
+	.byte	0xb2, 0xca, 0x3e, 0x1d ; \
+	.byte	0xb6, 0xca, 0xbe, 0x3d ; \
+	.byte	0xba, 0xcb, 0x36, 0x19 ; \
+	.byte	0xbe, 0xcb, 0xb6, 0x39
+	!aes_eround01	%f8, %f60, %f62, %f56 ; \
+	!aes_eround23	%f10, %f60, %f62, %f58 ; \
+	!aes_eround01	%f12, %f56, %f58, %f60 ; \
+	!aes_eround23	%f14, %f56, %f58, %f62
+
+#define	MID_TWO_EROUNDS_2 \
+	.byte	0x8c, 0xca, 0x04, 0x00 ; \
+	.byte	0x88, 0xca, 0x84, 0x20 ; \
+	.byte	0xb2, 0xca, 0x3e, 0x1d ; \
+	.byte	0xb6, 0xca, 0xbe, 0x3d ; \
+	.byte	0x80, 0xcb, 0x08, 0x06 ; \
+	.byte	0x84, 0xcb, 0x88, 0x26 ; \
+	.byte	0xba, 0xcb, 0x36, 0x19 ; \
+	.byte	0xbe, 0xcb, 0xb6, 0x39
+	!aes_eround01	%f8, %f0, %f2, %f6 ; \
+	!aes_eround23	%f10, %f0, %f2, %f4 ; \
+	!aes_eround01	%f8, %f60, %f62, %f56 ; \
+	!aes_eround23	%f10, %f60, %f62, %f58 ; \
+	!aes_eround01	%f12, %f6, %f4, %f0 ; \
+	!aes_eround23	%f14, %f6, %f4, %f2 ; \
+	!aes_eround01	%f12, %f56, %f58, %f60 ; \
+	!aes_eround23	%f14, %f56, %f58, %f62
+
+#define	TEN_EROUNDS \
+	.byte	0xb2, 0xcc, 0x3e, 0x1d ; \
+	.byte	0xb6, 0xcc, 0xbe, 0x3d ; \
+	.byte	0xba, 0xcd, 0x36, 0x19 ; \
+	.byte	0xbe, 0xcd, 0xb6, 0x39 ; \
+	.byte	0xb2, 0xce, 0x3e, 0x1d ; \
+	.byte	0xb6, 0xce, 0xbe, 0x3d ; \
+	.byte	0xba, 0xcf, 0x36, 0x19 ; \
+	.byte	0xbe, 0xcf, 0xb6, 0x39 ; \
+	.byte	0xb2, 0xc8, 0x7e, 0x1d ; \
+	.byte	0xb6, 0xc8, 0xfe, 0x3d ; \
+	.byte	0xba, 0xc9, 0x76, 0x19 ; \
+	.byte	0xbe, 0xc9, 0xf6, 0x39 ; \
+	.byte	0xb2, 0xca, 0x7e, 0x1d ; \
+	.byte	0xb6, 0xca, 0xfe, 0x3d ; \
+	.byte	0xba, 0xcb, 0x76, 0x19 ; \
+	.byte	0xbe, 0xcb, 0xf6, 0x39 ; \
+	.byte	0xb2, 0xcc, 0x7e, 0x1d ; \
+	.byte	0xb6, 0xcc, 0xfe, 0x3d ; \
+	.byte	0xba, 0xcd, 0x76, 0x99 ; \
+	.byte	0xbe, 0xcd, 0xf6, 0xb9
+	!aes_eround01	%f16, %f60, %f62, %f56 ; \
+	!aes_eround23	%f18, %f60, %f62, %f58 ; \
+	!aes_eround01	%f20, %f56, %f58, %f60 ; \
+	!aes_eround23	%f22, %f56, %f58, %f62 ; \
+	!aes_eround01	%f24, %f60, %f62, %f56 ; \
+	!aes_eround23	%f26, %f60, %f62, %f58 ; \
+	!aes_eround01	%f28, %f56, %f58, %f60 ; \
+	!aes_eround23	%f30, %f56, %f58, %f62 ; \
+	!aes_eround01	%f32, %f60, %f62, %f56 ; \
+	!aes_eround23	%f34, %f60, %f62, %f58 ; \
+	!aes_eround01	%f36, %f56, %f58, %f60 ; \
+	!aes_eround23	%f38, %f56, %f58, %f62 ; \
+	!aes_eround01	%f40, %f60, %f62, %f56 ; \
+	!aes_eround23	%f42, %f60, %f62, %f58 ; \
+	!aes_eround01	%f44, %f56, %f58, %f60 ; \
+	!aes_eround23	%f46, %f56, %f58, %f62 ; \
+	!aes_eround01	%f48, %f60, %f62, %f56 ; \
+	!aes_eround23	%f50, %f60, %f62, %f58 ; \
+	!aes_eround01_l	%f52, %f56, %f58, %f60 ; \
+	!aes_eround23_l	%f54, %f56, %f58, %f62
+
+#define	TEN_EROUNDS_2 \
+	.byte	0x8c, 0xcc, 0x04, 0x00 ; \
+	.byte	0x88, 0xcc, 0x84, 0x20 ; \
+	.byte	0xb2, 0xcc, 0x3e, 0x1d ; \
+	.byte	0xb6, 0xcc, 0xbe, 0x3d ; \
+	.byte	0x80, 0xcd, 0x08, 0x06 ; \
+	.byte	0x84, 0xcd, 0x88, 0x26 ; \
+	.byte	0xba, 0xcd, 0x36, 0x19 ; \
+	.byte	0xbe, 0xcd, 0xb6, 0x39 ; \
+	.byte	0x8c, 0xce, 0x04, 0x00 ; \
+	.byte	0x88, 0xce, 0x84, 0x20 ; \
+	.byte	0xb2, 0xce, 0x3e, 0x1d ; \
+	.byte	0xb6, 0xce, 0xbe, 0x3d ; \
+	.byte	0x80, 0xcf, 0x08, 0x06 ; \
+	.byte	0x84, 0xcf, 0x88, 0x26 ; \
+	.byte	0xba, 0xcf, 0x36, 0x19 ; \
+	.byte	0xbe, 0xcf, 0xb6, 0x39 ; \
+	.byte	0x8c, 0xc8, 0x44, 0x00 ; \
+	.byte	0x88, 0xc8, 0xc4, 0x20 ; \
+	.byte	0xb2, 0xc8, 0x7e, 0x1d ; \
+	.byte	0xb6, 0xc8, 0xfe, 0x3d ; \
+	.byte	0x80, 0xc9, 0x48, 0x06 ; \
+	.byte	0x84, 0xc9, 0xc8, 0x26 ; \
+	.byte	0xba, 0xc9, 0x76, 0x19 ; \
+	.byte	0xbe, 0xc9, 0xf6, 0x39 ; \
+	.byte	0x8c, 0xca, 0x44, 0x00 ; \
+	.byte	0x88, 0xca, 0xc4, 0x20 ; \
+	.byte	0xb2, 0xca, 0x7e, 0x1d ; \
+	.byte	0xb6, 0xca, 0xfe, 0x3d ; \
+	.byte	0x80, 0xcb, 0x48, 0x06 ; \
+	.byte	0x84, 0xcb, 0xc8, 0x26 ; \
+	.byte	0xba, 0xcb, 0x76, 0x19 ; \
+	.byte	0xbe, 0xcb, 0xf6, 0x39 ; \
+	.byte	0x8c, 0xcc, 0x44, 0x00 ; \
+	.byte	0x88, 0xcc, 0xc4, 0x20 ; \
+	.byte	0xb2, 0xcc, 0x7e, 0x1d ; \
+	.byte	0xb6, 0xcc, 0xfe, 0x3d ; \
+	.byte	0x80, 0xcd, 0x48, 0x86 ; \
+	.byte	0x84, 0xcd, 0xc8, 0xa6 ; \
+	.byte	0xba, 0xcd, 0x76, 0x99 ; \
+	.byte	0xbe, 0xcd, 0xf6, 0xb9
+	!aes_eround01	%f16, %f0, %f2, %f6 ; \
+	!aes_eround23	%f18, %f0, %f2, %f4 ; \
+	!aes_eround01	%f16, %f60, %f62, %f56 ; \
+	!aes_eround23	%f18, %f60, %f62, %f58 ; \
+	!aes_eround01	%f20, %f6, %f4, %f0 ; \
+	!aes_eround23	%f22, %f6, %f4, %f2 ; \
+	!aes_eround01	%f20, %f56, %f58, %f60 ; \
+	!aes_eround23	%f22, %f56, %f58, %f62 ; \
+	!aes_eround01	%f24, %f0, %f2, %f6 ; \
+	!aes_eround23	%f26, %f0, %f2, %f4 ; \
+	!aes_eround01	%f24, %f60, %f62, %f56 ; \
+	!aes_eround23	%f26, %f60, %f62, %f58 ; \
+	!aes_eround01	%f28, %f6, %f4, %f0 ; \
+	!aes_eround23	%f30, %f6, %f4, %f2 ; \
+	!aes_eround01	%f28, %f56, %f58, %f60 ; \
+	!aes_eround23	%f30, %f56, %f58, %f62 ; \
+	!aes_eround01	%f32, %f0, %f2, %f6 ; \
+	!aes_eround23	%f34, %f0, %f2, %f4 ; \
+	!aes_eround01	%f32, %f60, %f62, %f56 ; \
+	!aes_eround23	%f34, %f60, %f62, %f58 ; \
+	!aes_eround01	%f36, %f6, %f4, %f0 ; \
+	!aes_eround23	%f38, %f6, %f4, %f2 ; \
+	!aes_eround01	%f36, %f56, %f58, %f60 ; \
+	!aes_eround23	%f38, %f56, %f58, %f62 ; \
+	!aes_eround01	%f40, %f0, %f2, %f6 ; \
+	!aes_eround23	%f42, %f0, %f2, %f4 ; \
+	!aes_eround01	%f40, %f60, %f62, %f56 ; \
+	!aes_eround23	%f42, %f60, %f62, %f58 ; \
+	!aes_eround01	%f44, %f6, %f4, %f0 ; \
+	!aes_eround23	%f46, %f6, %f4, %f2 ; \
+	!aes_eround01	%f44, %f56, %f58, %f60 ; \
+	!aes_eround23	%f46, %f56, %f58, %f62 ; \
+	!aes_eround01	%f48, %f0, %f2, %f6 ; \
+	!aes_eround23	%f50, %f0, %f2, %f4 ; \
+	!aes_eround01	%f48, %f60, %f62, %f56 ; \
+	!aes_eround23	%f50, %f60, %f62, %f58 ; \
+	!aes_eround01_l	%f52, %f6, %f4, %f0 ; \
+	!aes_eround23_l	%f54, %f6, %f4, %f2 ; \
+	!aes_eround01_l	%f52, %f56, %f58, %f60 ; \
+	!aes_eround23_l	%f54, %f56, %f58, %f62
+
+#define	TWELVE_EROUNDS \
+	MID_TWO_EROUNDS	; \
+	TEN_EROUNDS
+
+#define	TWELVE_EROUNDS_2 \
+	MID_TWO_EROUNDS_2	; \
+	TEN_EROUNDS_2
+
+#define	FOURTEEN_EROUNDS \
+	FIRST_TWO_EROUNDS ; \
+	TWELVE_EROUNDS
+
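+! For AES-256 the key schedule already occupies %f0-%f54, leaving only
+! %f56-%f62 free.  The two-block variant therefore borrows %f0-%f6 and
+! %f20-%f24 for the second block's state and reloads those round keys from
+! memory (the ldd's interleaved below) before they are needed again.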
+#define	FOURTEEN_EROUNDS_2 \
+	.byte	0xb0, 0xc8, 0x2c, 0x14 ; \
+	.byte	0xac, 0xc8, 0xac, 0x34 ; \
+	ldd	[%o0 + 0x60], %f20 ; \
+	.byte	0xb2, 0xc8, 0x3e, 0x1d ; \
+	.byte	0xb6, 0xc8, 0xbe, 0x3d ; \
+	.byte	0x80, 0xc9, 0x2c, 0x18 ; \
+	.byte	0x84, 0xc9, 0xac, 0x38 ;\
+	ldd	[%o0 + 0x68], %f22 ; \
+	.byte	0xba, 0xc9, 0x36, 0x19 ; \
+	ldd	[%o0 + 0x70], %f24 ; \
+	.byte	0xbe, 0xc9, 0xb6, 0x39 ; \
+	.byte	0x8c, 0xca, 0x04, 0x00 ; \
+	.byte	0x88, 0xca, 0x84, 0x20 ; \
+	.byte	0xb2, 0xca, 0x3e, 0x1d ; \
+	.byte	0xb6, 0xca, 0xbe, 0x3d ; \
+	.byte	0x80, 0xcb, 0x08, 0x06 ; \
+	.byte	0x84, 0xcb, 0x88, 0x26 ; \
+	.byte	0xba, 0xcb, 0x36, 0x19 ; \
+	.byte	0xbe, 0xcb, 0xb6, 0x39 ; \
+	.byte	0x8c, 0xcc, 0x04, 0x00 ; \
+	.byte	0x88, 0xcc, 0x84, 0x20 ; \
+	.byte	0xb2, 0xcc, 0x3e, 0x1d ; \
+	.byte	0xb6, 0xcc, 0xbe, 0x3d ; \
+	.byte	0x80, 0xcd, 0x08, 0x06 ; \
+	.byte	0x84, 0xcd, 0x88, 0x26 ; \
+	.byte	0xba, 0xcd, 0x36, 0x19 ; \
+	.byte	0xbe, 0xcd, 0xb6, 0x39 ; \
+	.byte	0x8c, 0xce, 0x04, 0x00 ; \
+	.byte	0x88, 0xce, 0x84, 0x20 ; \
+	.byte	0xb2, 0xce, 0x3e, 0x1d ; \
+	.byte	0xb6, 0xce, 0xbe, 0x3d ; \
+	.byte	0x80, 0xcf, 0x08, 0x06 ; \
+	.byte	0x84, 0xcf, 0x88, 0x26 ; \
+	.byte	0xba, 0xcf, 0x36, 0x19 ; \
+	.byte	0xbe, 0xcf, 0xb6, 0x39 ; \
+	.byte	0x8c, 0xc8, 0x44, 0x00 ; \
+	.byte	0x88, 0xc8, 0xc4, 0x20 ; \
+	.byte	0xb2, 0xc8, 0x7e, 0x1d ; \
+	.byte	0xb6, 0xc8, 0xfe, 0x3d ; \
+	.byte	0x80, 0xc9, 0x48, 0x06 ; \
+	.byte	0x84, 0xc9, 0xc8, 0x26 ; \
+	.byte	0xba, 0xc9, 0x76, 0x19 ; \
+	.byte	0xbe, 0xc9, 0xf6, 0x39 ; \
+	.byte	0x8c, 0xca, 0x44, 0x00 ; \
+	.byte	0x88, 0xca, 0xc4, 0x20 ; \
+	.byte	0xb2, 0xca, 0x7e, 0x1d ; \
+	.byte	0xb6, 0xca, 0xfe, 0x3d ; \
+	.byte	0x80, 0xcb, 0x48, 0x06 ; \
+	.byte	0x84, 0xcb, 0xc8, 0x26 ; \
+	.byte	0xba, 0xcb, 0x76, 0x19 ; \
+	.byte	0xbe, 0xcb, 0xf6, 0x39 ; \
+	.byte	0x8c, 0xcc, 0x44, 0x00 ; \
+	.byte	0x88, 0xcc, 0xc4, 0x20 ; \
+	ldd	[%o0 + 0x10], %f0 ; \
+	.byte	0xb2, 0xcc, 0x7e, 0x1d ; \
+	ldd	[%o0 + 0x18], %f2 ; \
+	.byte	0xb6, 0xcc, 0xfe, 0x3d ; \
+	.byte	0xa8, 0xcd, 0x48, 0x86 ; \
+	.byte	0xac, 0xcd, 0xc8, 0xa6 ; \
+	ldd	[%o0 + 0x20], %f4 ; \
+	.byte	0xba, 0xcd, 0x76, 0x99 ; \
+	ldd	[%o0 + 0x28], %f6 ; \
+	.byte	0xbe, 0xcd, 0xf6, 0xb9
+	!aes_eround01	%f0, %f20, %f22, %f24 ; \
+	!aes_eround23	%f2, %f20, %f22, %f22 ; \
+	!ldd	[%o0 + 0x60], %f20 ; \
+	!aes_eround01	%f0, %f60, %f62, %f56 ; \
+	!aes_eround23	%f2, %f60, %f62, %f58 ; \
+	!aes_eround01	%f4, %f24, %f22, %f0 ; \
+	!aes_eround23	%f6, %f24, %f22, %f2 ; \
+	!ldd	[%o0 + 0x68], %f22 ; \
+	!aes_eround01	%f4, %f56, %f58, %f60 ; \
+	!ldd	[%o0 + 0x70], %f24 ; \
+	!aes_eround23	%f6, %f56, %f58, %f62 ; \
+	!aes_eround01	%f8, %f0, %f2, %f6 ; \
+	!aes_eround23	%f10, %f0, %f2, %f4 ; \
+	!aes_eround01	%f8, %f60, %f62, %f56 ; \
+	!aes_eround23	%f10, %f60, %f62, %f58 ; \
+	!aes_eround01	%f12, %f6, %f4, %f0 ; \
+	!aes_eround23	%f14, %f6, %f4, %f2 ; \
+	!aes_eround01	%f12, %f56, %f58, %f60 ; \
+	!aes_eround23	%f14, %f56, %f58, %f62 ; \
+	!aes_eround01	%f16, %f0, %f2, %f6 ; \
+	!aes_eround23	%f18, %f0, %f2, %f4 ; \
+	!aes_eround01	%f16, %f60, %f62, %f56 ; \
+	!aes_eround23	%f18, %f60, %f62, %f58 ; \
+	!aes_eround01	%f20, %f6, %f4, %f0 ; \
+	!aes_eround23	%f22, %f6, %f4, %f2 ; \
+	!aes_eround01	%f20, %f56, %f58, %f60 ; \
+	!aes_eround23	%f22, %f56, %f58, %f62 ; \
+	!aes_eround01	%f24, %f0, %f2, %f6 ; \
+	!aes_eround23	%f26, %f0, %f2, %f4 ; \
+	!aes_eround01	%f24, %f60, %f62, %f56 ; \
+	!aes_eround23	%f26, %f60, %f62, %f58 ; \
+	!aes_eround01	%f28, %f6, %f4, %f0 ; \
+	!aes_eround23	%f30, %f6, %f4, %f2 ; \
+	!aes_eround01	%f28, %f56, %f58, %f60 ; \
+	!aes_eround23	%f30, %f56, %f58, %f62 ; \
+	!aes_eround01	%f32, %f0, %f2, %f6 ; \
+	!aes_eround23	%f34, %f0, %f2, %f4 ; \
+	!aes_eround01	%f32, %f60, %f62, %f56 ; \
+	!aes_eround23	%f34, %f60, %f62, %f58 ; \
+	!aes_eround01	%f36, %f6, %f4, %f0 ; \
+	!aes_eround23	%f38, %f6, %f4, %f2 ; \
+	!aes_eround01	%f36, %f56, %f58, %f60 ; \
+	!aes_eround23	%f38, %f56, %f58, %f62 ; \
+	!aes_eround01	%f40, %f0, %f2, %f6 ; \
+	!aes_eround23	%f42, %f0, %f2, %f4 ; \
+	!aes_eround01	%f40, %f60, %f62, %f56 ; \
+	!aes_eround23	%f42, %f60, %f62, %f58 ; \
+	!aes_eround01	%f44, %f6, %f4, %f0 ; \
+	!aes_eround23	%f46, %f6, %f4, %f2 ; \
+	!aes_eround01	%f44, %f56, %f58, %f60 ; \
+	!aes_eround23	%f46, %f56, %f58, %f62 ; \
+	!aes_eround01	%f48, %f0, %f2, %f6 ; \
+	!aes_eround23	%f50, %f0, %f2, %f4 ; \
+	!ldd	[%o0 + 0x10], %f0 ; \
+	!aes_eround01	%f48, %f60, %f62, %f56 ; \
+	!ldd	[%o0 + 0x18], %f2 ; \
+	!aes_eround23	%f50, %f60, %f62, %f58 ; \
+	!aes_eround01_l	%f52, %f6, %f4, %f20 ; \
+	!aes_eround23_l	%f54, %f6, %f4, %f22 ; \
+	!ldd	[%o0 + 0x20], %f4 ; \
+	!aes_eround01_l	%f52, %f56, %f58, %f60 ; \
+	!ldd	[%o0 + 0x28], %f6 ; \
+	!aes_eround23_l	%f54, %f56, %f58, %f62
+
+#define	FIRST_TWO_DROUNDS \
+	.byte	0xb2, 0xc8, 0x3e, 0x5d ; \
+	.byte	0xb6, 0xc8, 0xbe, 0x7d ; \
+	.byte	0xba, 0xc9, 0x36, 0x59 ; \
+	.byte	0xbe, 0xc9, 0xb6, 0x79
+	!aes_dround01	%f0, %f60, %f62, %f56 ; \
+	!aes_dround23	%f2, %f60, %f62, %f58 ; \
+	!aes_dround01	%f4, %f56, %f58, %f60 ; \
+	!aes_dround23	%f6, %f56, %f58, %f62
+
+#define	MID_TWO_DROUNDS \
+	.byte	0xb2, 0xca, 0x3e, 0x5d ; \
+	.byte	0xb6, 0xca, 0xbe, 0x7d ; \
+	.byte	0xba, 0xcb, 0x36, 0x59 ; \
+	.byte	0xbe, 0xcb, 0xb6, 0x79
+	!aes_dround01	%f8, %f60, %f62, %f56 ; \
+	!aes_dround23	%f10, %f60, %f62, %f58 ; \
+	!aes_dround01	%f12, %f56, %f58, %f60 ; \
+	!aes_dround23	%f14, %f56, %f58, %f62
+
+#define	MID_TWO_DROUNDS_2 \
+	.byte	0x8c, 0xca, 0x04, 0x40 ; \
+	.byte	0x88, 0xca, 0x84, 0x60 ; \
+	.byte	0xb2, 0xca, 0x3e, 0x5d ; \
+	.byte	0xb6, 0xca, 0xbe, 0x7d ; \
+	.byte	0x80, 0xcb, 0x08, 0x46 ; \
+	.byte	0x84, 0xcb, 0x88, 0x66 ; \
+	.byte	0xba, 0xcb, 0x36, 0x59 ; \
+	.byte	0xbe, 0xcb, 0xb6, 0x79
+	!aes_dround01	%f8, %f0, %f2, %f6 ; \
+	!aes_dround23	%f10, %f0, %f2, %f4 ; \
+	!aes_dround01	%f8, %f60, %f62, %f56 ; \
+	!aes_dround23	%f10, %f60, %f62, %f58 ; \
+	!aes_dround01	%f12, %f6, %f4, %f0 ; \
+	!aes_dround23	%f14, %f6, %f4, %f2 ; \
+	!aes_dround01	%f12, %f56, %f58, %f60 ; \
+	!aes_dround23	%f14, %f56, %f58, %f62
+
+#define	TEN_DROUNDS \
+	.byte	0xb2, 0xcc, 0x3e, 0x5d ; \
+	.byte	0xb6, 0xcc, 0xbe, 0x7d ; \
+	.byte	0xba, 0xcd, 0x36, 0x59 ; \
+	.byte	0xbe, 0xcd, 0xb6, 0x79 ; \
+	.byte	0xb2, 0xce, 0x3e, 0x5d ; \
+	.byte	0xb6, 0xce, 0xbe, 0x7d ; \
+	.byte	0xba, 0xcf, 0x36, 0x59 ; \
+	.byte	0xbe, 0xcf, 0xb6, 0x79 ; \
+	.byte	0xb2, 0xc8, 0x7e, 0x5d ; \
+	.byte	0xb6, 0xc8, 0xfe, 0x7d ; \
+	.byte	0xba, 0xc9, 0x76, 0x59 ; \
+	.byte	0xbe, 0xc9, 0xf6, 0x79 ; \
+	.byte	0xb2, 0xca, 0x7e, 0x5d ; \
+	.byte	0xb6, 0xca, 0xfe, 0x7d ; \
+	.byte	0xba, 0xcb, 0x76, 0x59 ; \
+	.byte	0xbe, 0xcb, 0xf6, 0x79 ; \
+	.byte	0xb2, 0xcc, 0x7e, 0x5d ; \
+	.byte	0xb6, 0xcc, 0xfe, 0x7d ; \
+	.byte	0xba, 0xcd, 0x76, 0xd9 ; \
+	.byte	0xbe, 0xcd, 0xf6, 0xf9
+	!aes_dround01	%f16, %f60, %f62, %f56 ; \
+	!aes_dround23	%f18, %f60, %f62, %f58 ; \
+	!aes_dround01	%f20, %f56, %f58, %f60 ; \
+	!aes_dround23	%f22, %f56, %f58, %f62 ; \
+	!aes_dround01	%f24, %f60, %f62, %f56 ; \
+	!aes_dround23	%f26, %f60, %f62, %f58 ; \
+	!aes_dround01	%f28, %f56, %f58, %f60 ; \
+	!aes_dround23	%f30, %f56, %f58, %f62 ; \
+	!aes_dround01	%f32, %f60, %f62, %f56 ; \
+	!aes_dround23	%f34, %f60, %f62, %f58 ; \
+	!aes_dround01	%f36, %f56, %f58, %f60 ; \
+	!aes_dround23	%f38, %f56, %f58, %f62 ; \
+	!aes_dround01	%f40, %f60, %f62, %f56 ; \
+	!aes_dround23	%f42, %f60, %f62, %f58 ; \
+	!aes_dround01	%f44, %f56, %f58, %f60 ; \
+	!aes_dround23	%f46, %f56, %f58, %f62 ; \
+	!aes_dround01	%f48, %f60, %f62, %f56 ; \
+	!aes_dround23	%f50, %f60, %f62, %f58 ; \
+	!aes_dround01_l	%f52, %f56, %f58, %f60 ; \
+	!aes_dround23_l	%f54, %f56, %f58, %f62
+
+#define	TEN_DROUNDS_2 \
+	.byte	0x8c, 0xcc, 0x04, 0x40 ; \
+	.byte	0x88, 0xcc, 0x84, 0x60 ; \
+	.byte	0xb2, 0xcc, 0x3e, 0x5d ; \
+	.byte	0xb6, 0xcc, 0xbe, 0x7d ; \
+	.byte	0x80, 0xcd, 0x08, 0x46 ; \
+	.byte	0x84, 0xcd, 0x88, 0x66 ; \
+	.byte	0xba, 0xcd, 0x36, 0x59 ; \
+	.byte	0xbe, 0xcd, 0xb6, 0x79 ; \
+	.byte	0x8c, 0xce, 0x04, 0x40 ; \
+	.byte	0x88, 0xce, 0x84, 0x60 ; \
+	.byte	0xb2, 0xce, 0x3e, 0x5d ; \
+	.byte	0xb6, 0xce, 0xbe, 0x7d ; \
+	.byte	0x80, 0xcf, 0x08, 0x46 ; \
+	.byte	0x84, 0xcf, 0x88, 0x66 ; \
+	.byte	0xba, 0xcf, 0x36, 0x59 ; \
+	.byte	0xbe, 0xcf, 0xb6, 0x79 ; \
+	.byte	0x8c, 0xc8, 0x44, 0x40 ; \
+	.byte	0x88, 0xc8, 0xc4, 0x60 ; \
+	.byte	0xb2, 0xc8, 0x7e, 0x5d ; \
+	.byte	0xb6, 0xc8, 0xfe, 0x7d ; \
+	.byte	0x80, 0xc9, 0x48, 0x46 ; \
+	.byte	0x84, 0xc9, 0xc8, 0x66 ; \
+	.byte	0xba, 0xc9, 0x76, 0x59 ; \
+	.byte	0xbe, 0xc9, 0xf6, 0x79 ; \
+	.byte	0x8c, 0xca, 0x44, 0x40 ; \
+	.byte	0x88, 0xca, 0xc4, 0x60 ; \
+	.byte	0xb2, 0xca, 0x7e, 0x5d ; \
+	.byte	0xb6, 0xca, 0xfe, 0x7d ; \
+	.byte	0x80, 0xcb, 0x48, 0x46 ; \
+	.byte	0x84, 0xcb, 0xc8, 0x66 ; \
+	.byte	0xba, 0xcb, 0x76, 0x59 ; \
+	.byte	0xbe, 0xcb, 0xf6, 0x79 ; \
+	.byte	0x8c, 0xcc, 0x44, 0x40 ; \
+	.byte	0x88, 0xcc, 0xc4, 0x60 ; \
+	.byte	0xb2, 0xcc, 0x7e, 0x5d ; \
+	.byte	0xb6, 0xcc, 0xfe, 0x7d ; \
+	.byte	0x80, 0xcd, 0x48, 0xc6 ; \
+	.byte	0x84, 0xcd, 0xc8, 0xe6 ; \
+	.byte	0xba, 0xcd, 0x76, 0xd9 ; \
+	.byte	0xbe, 0xcd, 0xf6, 0xf9
+	!aes_dround01	%f16, %f0, %f2, %f6 ; \
+	!aes_dround23	%f18, %f0, %f2, %f4 ; \
+	!aes_dround01	%f16, %f60, %f62, %f56 ; \
+	!aes_dround23	%f18, %f60, %f62, %f58 ; \
+	!aes_dround01	%f20, %f6, %f4, %f0 ; \
+	!aes_dround23	%f22, %f6, %f4, %f2 ; \
+	!aes_dround01	%f20, %f56, %f58, %f60 ; \
+	!aes_dround23	%f22, %f56, %f58, %f62 ; \
+	!aes_dround01	%f24, %f0, %f2, %f6 ; \
+	!aes_dround23	%f26, %f0, %f2, %f4 ; \
+	!aes_dround01	%f24, %f60, %f62, %f56 ; \
+	!aes_dround23	%f26, %f60, %f62, %f58 ; \
+	!aes_dround01	%f28, %f6, %f4, %f0 ; \
+	!aes_dround23	%f30, %f6, %f4, %f2 ; \
+	!aes_dround01	%f28, %f56, %f58, %f60 ; \
+	!aes_dround23	%f30, %f56, %f58, %f62 ; \
+	!aes_dround01	%f32, %f0, %f2, %f6 ; \
+	!aes_dround23	%f34, %f0, %f2, %f4 ; \
+	!aes_dround01	%f32, %f60, %f62, %f56 ; \
+	!aes_dround23	%f34, %f60, %f62, %f58 ; \
+	!aes_dround01	%f36, %f6, %f4, %f0 ; \
+	!aes_dround23	%f38, %f6, %f4, %f2 ; \
+	!aes_dround01	%f36, %f56, %f58, %f60 ; \
+	!aes_dround23	%f38, %f56, %f58, %f62 ; \
+	!aes_dround01	%f40, %f0, %f2, %f6 ; \
+	!aes_dround23	%f42, %f0, %f2, %f4 ; \
+	!aes_dround01	%f40, %f60, %f62, %f56 ; \
+	!aes_dround23	%f42, %f60, %f62, %f58 ; \
+	!aes_dround01	%f44, %f6, %f4, %f0 ; \
+	!aes_dround23	%f46, %f6, %f4, %f2 ; \
+	!aes_dround01	%f44, %f56, %f58, %f60 ; \
+	!aes_dround23	%f46, %f56, %f58, %f62 ; \
+	!aes_dround01	%f48, %f0, %f2, %f6 ; \
+	!aes_dround23	%f50, %f0, %f2, %f4 ; \
+	!aes_dround01	%f48, %f60, %f62, %f56 ; \
+	!aes_dround23	%f50, %f60, %f62, %f58 ; \
+	!aes_dround01_l	%f52, %f6, %f4, %f0 ; \
+	!aes_dround23_l	%f54, %f6, %f4, %f2 ; \
+	!aes_dround01_l	%f52, %f56, %f58, %f60 ; \
+	!aes_dround23_l	%f54, %f56, %f58, %f62
+
+#define	TWELVE_DROUNDS \
+	MID_TWO_DROUNDS	; \
+	TEN_DROUNDS
+
+#define	TWELVE_DROUNDS_2 \
+	MID_TWO_DROUNDS_2	; \
+	TEN_DROUNDS_2
+
+#define	FOURTEEN_DROUNDS \
+	FIRST_TWO_DROUNDS ; \
+	TWELVE_DROUNDS
+
+#define	FOURTEEN_DROUNDS_2 \
+	.byte	0xb0, 0xc8, 0x2c, 0x54 ; \
+	.byte	0xac, 0xc8, 0xac, 0x74 ; \
+	ldd	[%o0 + 0x80], %f20 ; \
+	.byte	0xb2, 0xc8, 0x3e, 0x5d ; \
+	.byte	0xb6, 0xc8, 0xbe, 0x7d ; \
+	.byte	0x80, 0xc9, 0x2c, 0x58 ; \
+	.byte	0x84, 0xc9, 0xac, 0x78 ; \
+	ldd	[%o0 + 0x88], %f22 ; \
+	.byte	0xba, 0xc9, 0x36, 0x59 ; \
+	ldd	[%o0 + 0x70], %f24 ; \
+	.byte	0xbe, 0xc9, 0xb6, 0x79 ; \
+	.byte	0x8c, 0xca, 0x04, 0x40 ; \
+	.byte	0x88, 0xca, 0x84, 0x60 ; \
+	.byte	0xb2, 0xca, 0x3e, 0x5d ; \
+	.byte	0xb6, 0xca, 0xbe, 0x7d ; \
+	.byte	0x80, 0xcb, 0x08, 0x46 ; \
+	.byte	0x84, 0xcb, 0x88, 0x66 ; \
+	.byte	0xba, 0xcb, 0x36, 0x59 ; \
+	.byte	0xbe, 0xcb, 0xb6, 0x79 ; \
+	.byte	0x8c, 0xcc, 0x04, 0x40 ; \
+	.byte	0x88, 0xcc, 0x84, 0x60 ; \
+	.byte	0xb2, 0xcc, 0x3e, 0x5d ; \
+	.byte	0xb6, 0xcc, 0xbe, 0x7d ; \
+	.byte	0x80, 0xcd, 0x08, 0x46 ; \
+	.byte	0x84, 0xcd, 0x88, 0x66 ; \
+	.byte	0xba, 0xcd, 0x36, 0x59 ; \
+	.byte	0xbe, 0xcd, 0xb6, 0x79 ; \
+	.byte	0x8c, 0xce, 0x04, 0x40 ; \
+	.byte	0x88, 0xce, 0x84, 0x60 ; \
+	.byte	0xb2, 0xce, 0x3e, 0x5d ; \
+	.byte	0xb6, 0xce, 0xbe, 0x7d ; \
+	.byte	0x80, 0xcf, 0x08, 0x46 ; \
+	.byte	0x84, 0xcf, 0x88, 0x66 ; \
+	.byte	0xba, 0xcf, 0x36, 0x59 ; \
+	.byte	0xbe, 0xcf, 0xb6, 0x79 ; \
+	.byte	0x8c, 0xc8, 0x44, 0x40 ; \
+	.byte	0x88, 0xc8, 0xc4, 0x60 ; \
+	.byte	0xb2, 0xc8, 0x7e, 0x5d ; \
+	.byte	0xb6, 0xc8, 0xfe, 0x7d ; \
+	.byte	0x80, 0xc9, 0x48, 0x46 ; \
+	.byte	0x84, 0xc9, 0xc8, 0x66 ; \
+	.byte	0xba, 0xc9, 0x76, 0x59 ; \
+	.byte	0xbe, 0xc9, 0xf6, 0x79 ; \
+	.byte	0x8c, 0xca, 0x44, 0x40 ; \
+	.byte	0x88, 0xca, 0xc4, 0x60 ; \
+	.byte	0xb2, 0xca, 0x7e, 0x5d ; \
+	.byte	0xb6, 0xca, 0xfe, 0x7d ; \
+	.byte	0x80, 0xcb, 0x48, 0x46 ; \
+	.byte	0x84, 0xcb, 0xc8, 0x66 ; \
+	.byte	0xba, 0xcb, 0x76, 0x59 ; \
+	.byte	0xbe, 0xcb, 0xf6, 0x79 ; \
+	.byte	0x8c, 0xcc, 0x44, 0x40 ; \
+	.byte	0x88, 0xcc, 0xc4, 0x60 ; \
+	ldd	[%o0 + 0xd0], %f0 ; \
+	.byte	0xb2, 0xcc, 0x7e, 0x5d ; \
+	ldd	[%o0 + 0xd8], %f2 ; \
+	.byte	0xb6, 0xcc, 0xfe, 0x7d ; \
+	.byte	0xa8, 0xcd, 0x48, 0xc6 ; \
+	.byte	0xac, 0xcd, 0xc8, 0xe6 ; \
+	ldd	[%o0 + 0xc0], %f4 ; \
+	.byte	0xba, 0xcd, 0x76, 0xd9 ; \
+	ldd	[%o0 + 0xc8], %f6 ; \
+	.byte	0xbe, 0xcd, 0xf6, 0xf9
+	!aes_dround01	%f0, %f20, %f22, %f24 ; \
+	!aes_dround23	%f2, %f20, %f22, %f22 ; \
+	!ldd	[%o0 + 0x80], %f20 ; \
+	!aes_dround01	%f0, %f60, %f62, %f56 ; \
+	!aes_dround23	%f2, %f60, %f62, %f58 ; \
+	!aes_dround01	%f4, %f24, %f22, %f0 ; \
+	!aes_dround23	%f6, %f24, %f22, %f2 ; \
+	!ldd	[%o0 + 0x88], %f22 ; \
+	!aes_dround01	%f4, %f56, %f58, %f60 ; \
+	!ldd	[%o0 + 0x70], %f24 ; \
+	!aes_dround23	%f6, %f56, %f58, %f62 ; \
+	!aes_dround01	%f8, %f0, %f2, %f6 ; \
+	!aes_dround23	%f10, %f0, %f2, %f4 ; \
+	!aes_dround01	%f8, %f60, %f62, %f56 ; \
+	!aes_dround23	%f10, %f60, %f62, %f58 ; \
+	!aes_dround01	%f12, %f6, %f4, %f0 ; \
+	!aes_dround23	%f14, %f6, %f4, %f2 ; \
+	!aes_dround01	%f12, %f56, %f58, %f60 ; \
+	!aes_dround23	%f14, %f56, %f58, %f62 ; \
+	!aes_dround01	%f16, %f0, %f2, %f6 ; \
+	!aes_dround23	%f18, %f0, %f2, %f4 ; \
+	!aes_dround01	%f16, %f60, %f62, %f56 ; \
+	!aes_dround23	%f18, %f60, %f62, %f58 ; \
+	!aes_dround01	%f20, %f6, %f4, %f0 ; \
+	!aes_dround23	%f22, %f6, %f4, %f2 ; \
+	!aes_dround01	%f20, %f56, %f58, %f60 ; \
+	!aes_dround23	%f22, %f56, %f58, %f62 ; \
+	!aes_dround01	%f24, %f0, %f2, %f6 ; \
+	!aes_dround23	%f26, %f0, %f2, %f4 ; \
+	!aes_dround01	%f24, %f60, %f62, %f56 ; \
+	!aes_dround23	%f26, %f60, %f62, %f58 ; \
+	!aes_dround01	%f28, %f6, %f4, %f0 ; \
+	!aes_dround23	%f30, %f6, %f4, %f2 ; \
+	!aes_dround01	%f28, %f56, %f58, %f60 ; \
+	!aes_dround23	%f30, %f56, %f58, %f62 ; \
+	!aes_dround01	%f32, %f0, %f2, %f6 ; \
+	!aes_dround23	%f34, %f0, %f2, %f4 ; \
+	!aes_dround01	%f32, %f60, %f62, %f56 ; \
+	!aes_dround23	%f34, %f60, %f62, %f58 ; \
+	!aes_dround01	%f36, %f6, %f4, %f0 ; \
+	!aes_dround23	%f38, %f6, %f4, %f2 ; \
+	!aes_dround01	%f36, %f56, %f58, %f60 ; \
+	!aes_dround23	%f38, %f56, %f58, %f62 ; \
+	!aes_dround01	%f40, %f0, %f2, %f6 ; \
+	!aes_dround23	%f42, %f0, %f2, %f4 ; \
+	!aes_dround01	%f40, %f60, %f62, %f56 ; \
+	!aes_dround23	%f42, %f60, %f62, %f58 ; \
+	!aes_dround01	%f44, %f6, %f4, %f0 ; \
+	!aes_dround23	%f46, %f6, %f4, %f2 ; \
+	!aes_dround01	%f44, %f56, %f58, %f60 ; \
+	!aes_dround23	%f46, %f56, %f58, %f62 ; \
+	!aes_dround01	%f48, %f0, %f2, %f6 ; \
+	!aes_dround23	%f50, %f0, %f2, %f4 ; \
+	!ldd	[%o0 + 0xd0], %f0 ; \
+	!aes_dround01	%f48, %f60, %f62, %f56 ; \
+	!ldd	[%o0 + 0xd8], %f2 ; \
+	!aes_dround23	%f50, %f60, %f62, %f58 ; \
+	!aes_dround01_l	%f52, %f6, %f4, %f20 ; \
+	!aes_dround23_l	%f54, %f6, %f4, %f22 ; \
+	!ldd	[%o0 + 0xc0], %f4 ; \
+	!aes_dround01_l	%f52, %f56, %f58, %f60 ; \
+	!ldd	[%o0 + 0xc8], %f6 ; \
+	!aes_dround23_l	%f54, %f56, %f58, %f62
+
+
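+! The load_keys_for_encrypt routines preload the expanded key schedule
+! (everything past ks[0]/ks[1], which the crypt routines fetch with ldx)
+! into the FP registers used by the round macros above.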
+	ENTRY(t4_aes128_load_keys_for_encrypt)
+
+	ldd	[%o0 + 0x10], %f16
+	ldd	[%o0 + 0x18], %f18
+	ldd	[%o0 + 0x20], %f20
+	ldd	[%o0 + 0x28], %f22
+	ldd	[%o0 + 0x30], %f24
+	ldd	[%o0 + 0x38], %f26
+	ldd	[%o0 + 0x40], %f28
+	ldd	[%o0 + 0x48], %f30
+	ldd	[%o0 + 0x50], %f32
+	ldd	[%o0 + 0x58], %f34
+	ldd	[%o0 + 0x60], %f36
+	ldd	[%o0 + 0x68], %f38
+	ldd	[%o0 + 0x70], %f40
+	ldd	[%o0 + 0x78], %f42
+	ldd	[%o0 + 0x80], %f44
+	ldd	[%o0 + 0x88], %f46
+	ldd	[%o0 + 0x90], %f48
+	ldd	[%o0 + 0x98], %f50
+	ldd	[%o0 + 0xa0], %f52
+	retl
+	ldd	[%o0 + 0xa8], %f54
+
+	SET_SIZE(t4_aes128_load_keys_for_encrypt)
+
+
+	ENTRY(t4_aes192_load_keys_for_encrypt)
+
+	ldd	[%o0 + 0x10], %f8
+	ldd	[%o0 + 0x18], %f10
+	ldd	[%o0 + 0x20], %f12
+	ldd	[%o0 + 0x28], %f14
+	ldd	[%o0 + 0x30], %f16
+	ldd	[%o0 + 0x38], %f18
+	ldd	[%o0 + 0x40], %f20
+	ldd	[%o0 + 0x48], %f22
+	ldd	[%o0 + 0x50], %f24
+	ldd	[%o0 + 0x58], %f26
+	ldd	[%o0 + 0x60], %f28
+	ldd	[%o0 + 0x68], %f30
+	ldd	[%o0 + 0x70], %f32
+	ldd	[%o0 + 0x78], %f34
+	ldd	[%o0 + 0x80], %f36
+	ldd	[%o0 + 0x88], %f38
+	ldd	[%o0 + 0x90], %f40
+	ldd	[%o0 + 0x98], %f42
+	ldd	[%o0 + 0xa0], %f44
+	ldd	[%o0 + 0xa8], %f46
+	ldd	[%o0 + 0xb0], %f48
+	ldd	[%o0 + 0xb8], %f50
+	ldd	[%o0 + 0xc0], %f52
+	retl
+	ldd	[%o0 + 0xc8], %f54
+
+	SET_SIZE(t4_aes192_load_keys_for_encrypt)
+
+
+	ENTRY(t4_aes256_load_keys_for_encrypt)
+
+	ldd	[%o0 + 0x10], %f0
+	ldd	[%o0 + 0x18], %f2
+	ldd	[%o0 + 0x20], %f4
+	ldd	[%o0 + 0x28], %f6
+	ldd	[%o0 + 0x30], %f8
+	ldd	[%o0 + 0x38], %f10
+	ldd	[%o0 + 0x40], %f12
+	ldd	[%o0 + 0x48], %f14
+	ldd	[%o0 + 0x50], %f16
+	ldd	[%o0 + 0x58], %f18
+	ldd	[%o0 + 0x60], %f20
+	ldd	[%o0 + 0x68], %f22
+	ldd	[%o0 + 0x70], %f24
+	ldd	[%o0 + 0x78], %f26
+	ldd	[%o0 + 0x80], %f28
+	ldd	[%o0 + 0x88], %f30
+	ldd	[%o0 + 0x90], %f32
+	ldd	[%o0 + 0x98], %f34
+	ldd	[%o0 + 0xa0], %f36
+	ldd	[%o0 + 0xa8], %f38
+	ldd	[%o0 + 0xb0], %f40
+	ldd	[%o0 + 0xb8], %f42
+	ldd	[%o0 + 0xc0], %f44
+	ldd	[%o0 + 0xc8], %f46
+	ldd	[%o0 + 0xd0], %f48
+	ldd	[%o0 + 0xd8], %f50
+	ldd	[%o0 + 0xe0], %f52
+	retl
+	ldd	[%o0 + 0xe8], %f54
+
+	SET_SIZE(t4_aes256_load_keys_for_encrypt)
+
+
+#define	TEST_PARALLEL_ECB_ENCRYPT
+#ifdef	TEST_PARALLEL_ECB_ENCRYPT
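+! The parallel variants below process two 16-byte blocks per iteration with
+! the *_EROUNDS_2 macros; when the byte count is an odd multiple of 16, a
+! single block is handled first so the main loop always consumes 32 bytes.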
+	ENTRY(t4_aes128_ecb_encrypt)
+
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+	and	%o3, 16, %o4
+	brz	%o4, ecbenc128_loop
+	nop
+
+	ldx	[%o1], %g3	!input
+	ldx	[%o1 + 8], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f60
+	movxtod	%g4, %f62
+
+	TEN_EROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	be	ecbenc128_loop_end
+	add	%o2, 16, %o2
+
+ecbenc128_loop:
+	ldx	[%o1], %g3	!input
+	ldx	[%o1 + 8], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f0
+	movxtod	%g4, %f2
+	ldx	[%o1 + 16], %g3	!input
+	ldx	[%o1 + 24], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f60
+	movxtod	%g4, %f62
+
+	TEN_EROUNDS_2
+
+	std	%f0, [%o2]
+	std	%f2, [%o2 + 8]
+
+	std	%f60, [%o2 + 16]
+	std	%f62, [%o2 + 24]
+
+	add	%o1, 32, %o1
+	subcc	%o3, 32, %o3
+	bne	ecbenc128_loop
+	add	%o2, 32, %o2
+ecbenc128_loop_end:
+	retl
+	nop
+
+	SET_SIZE(t4_aes128_ecb_encrypt)
+
+
+	ENTRY(t4_aes192_ecb_encrypt)
+
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+	and	%o3, 16, %o4
+	brz	%o4, ecbenc192_loop
+	nop
+
+	ldx	[%o1], %g3	!input
+	ldx	[%o1 + 8], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f60
+	movxtod	%g4, %f62
+
+	TWELVE_EROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	be	ecbenc192_loop_end
+	add	%o2, 16, %o2
+
+ecbenc192_loop:
+	ldx	[%o1], %g3	!input
+	ldx	[%o1 + 8], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f0
+	movxtod	%g4, %f2
+	ldx	[%o1 + 16], %g3	!input
+	ldx	[%o1 + 24], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f60
+	movxtod	%g4, %f62
+
+	TWELVE_EROUNDS_2
+
+	std	%f0, [%o2]
+	std	%f2, [%o2 + 8]
+
+	std	%f60, [%o2 + 16]
+	std	%f62, [%o2 + 24]
+
+	add	%o1, 32, %o1
+	subcc	%o3, 32, %o3
+	bne	ecbenc192_loop
+	add	%o2, 32, %o2
+ecbenc192_loop_end:
+	retl
+	nop
+
+	SET_SIZE(t4_aes192_ecb_encrypt)
+
+
+	ENTRY(t4_aes256_ecb_encrypt)
+
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+	and	%o3, 16, %o4
+	brz	%o4, ecbenc256_loop
+	nop
+
+	ldx	[%o1], %g3	!input
+	ldx	[%o1 + 8], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f60
+	movxtod	%g4, %f62
+
+	FOURTEEN_EROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	be	ecbenc256_loop_end
+	add	%o2, 16, %o2
+
+ecbenc256_loop:
+	ldx	[%o1], %g3	!input
+	ldx	[%o1 + 8], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f20
+	movxtod	%g4, %f22
+	ldx	[%o1 + 16], %g3	!input
+	ldx	[%o1 + 24], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f60
+	movxtod	%g4, %f62
+
+	FOURTEEN_EROUNDS_2
+
+	std	%f20, [%o2]
+	std	%f22, [%o2 + 8]
+
+	std	%f60, [%o2 + 16]
+	std	%f62, [%o2 + 24]
+
+	add	%o1, 32, %o1
+	subcc	%o3, 32, %o3
+	bne	ecbenc256_loop
+	add	%o2, 32, %o2
+
+	ldd	[%o0 + 0x60], %f20
+	ldd	[%o0 + 0x68], %f22
+
+ecbenc256_loop_end:
+	retl
+	nop
+
+	SET_SIZE(t4_aes256_ecb_encrypt)
+
+#else
+
+	ENTRY(t4_aes128_ecb_encrypt)
+
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+ecbenc128_loop:
+	ldx	[%o1], %g3	!input
+	ldx	[%o1 + 8], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f60
+	movxtod	%g4, %f62
+
+	TEN_EROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	ecbenc128_loop
+	add	%o2, 16, %o2
+
+	retl
+	nop
+
+	SET_SIZE(t4_aes128_ecb_encrypt)
+
+
+	ENTRY(t4_aes192_ecb_encrypt)
+
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+ecbenc192_loop:
+	ldx	[%o1], %g3	!input
+	ldx	[%o1 + 8], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f60
+	movxtod	%g4, %f62
+
+	TWELVE_EROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	ecbenc192_loop
+	add	%o2, 16, %o2
+
+	retl
+	nop
+
+	SET_SIZE(t4_aes192_ecb_encrypt)
+
+
+	ENTRY(t4_aes256_ecb_encrypt)
+
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+ecbenc256_loop:
+	ldx	[%o1], %g3	!input
+	ldx	[%o1 + 8], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f60
+	movxtod	%g4, %f62
+
+	FOURTEEN_EROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	ecbenc256_loop
+	add	%o2, 16, %o2
+
+	retl
+	nop
+
+	SET_SIZE(t4_aes256_ecb_encrypt)
+#endif
+
+
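+! CBC encryption: the previous ciphertext block (initially the IV, kept in
+! %f60/%f62) is XORed into the plaintext along with the initial AddRoundKey,
+! and the final ciphertext block is written back to the IV buffer so a later
+! call can continue the chain.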
+	ENTRY(t4_aes128_cbc_encrypt)
+
+	ldd	[%o4], %f60	! IV
+	ldd	[%o4 +8], %f62	! IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+cbcenc128_loop:
+	ldx	[%o1], %g3	!input
+	ldx	[%o1 + 8], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f56
+	movxtod	%g4, %f58
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	TEN_EROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	cbcenc128_loop
+	add	%o2, 16, %o2
+
+	std	%f60, [%o4]
+	retl
+	std	%f62, [%o4 + 8]
+
+	SET_SIZE(t4_aes128_cbc_encrypt)
+
+
+	ENTRY(t4_aes192_cbc_encrypt)
+
+	ldd	[%o4], %f60	! IV
+	ldd	[%o4 + 8], %f62	! IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+cbcenc192_loop:
+	ldx	[%o1], %g3	!input
+	ldx	[%o1 + 8], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f56
+	movxtod	%g4, %f58
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	TWELVE_EROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	cbcenc192_loop
+	add	%o2, 16, %o2
+
+	std	%f60, [%o4]
+	retl
+	std	%f62, [%o4 + 8]
+
+	SET_SIZE(t4_aes192_cbc_encrypt)
+
+
+	ENTRY(t4_aes256_cbc_encrypt)
+
+	ldd	[%o4], %f60	! IV
+	ldd	[%o4 + 8], %f62	! IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+cbcenc256_loop:
+	ldx	[%o1], %g3	!input
+	ldx	[%o1 + 8], %g4	!input
+	xor	%g1, %g3, %g3	!input ^ ks[0-1]
+	xor	%g2, %g4, %g4	!input ^ ks[0-1]
+	movxtod	%g3, %f56
+	movxtod	%g4, %f58
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	FOURTEEN_EROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	cbcenc256_loop
+	add	%o2, 16, %o2
+
+	std	%f60, [%o4]
+	retl
+	std	%f62, [%o4 + 8]
+
+	SET_SIZE(t4_aes256_cbc_encrypt)
+
+
+#define	TEST_PARALLEL_CTR_CRYPT
+#ifdef	TEST_PARALLEL_CTR_CRYPT
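+! CTR mode: the 128-bit counter block lives in %g3:%g4; each keystream block
+! is ks[0-1] ^ counter fed through the rounds, and only the low 64 bits
+! (%g4) are incremented.  The updated counter is stored back to the IV
+! buffer on return.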
+	ENTRY(t4_aes128_ctr_crypt)
+
+	ldx	[%o4], %g3	! IV
+	ldx	[%o4 +8], %g4	! IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+	and	%o3, 16, %g5
+	brz	%g5, ctr128_loop
+
+	xor	%g1, %g3, %g5
+	movxtod	%g5, %f60
+	xor	%g2, %g4, %g5
+	movxtod	%g5, %f62
+	inc	%g4
+
+	TEN_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	be	ctr128_loop_end
+	add	%o2, 16, %o2
+
+ctr128_loop:
+	xor	%g1, %g3, %g5
+	movxtod	%g5, %f0
+	xor	%g2, %g4, %g5
+	movxtod	%g5, %f2
+	inc	%g4
+
+	xor	%g1, %g3, %g5
+	movxtod	%g5, %f60
+	xor	%g2, %g4, %g5
+	movxtod	%g5, %f62
+	inc	%g4
+
+	TEN_EROUNDS_2
+
+	ldd	[%o1], %f6		!input
+	ldd	[%o1 + 8], %f4		!input
+	ldd	[%o1 + 16], %f56	!input
+	ldd	[%o1 + 24], %f58	!input
+	fxor	%f0, %f6, %f0
+	fxor	%f2, %f4, %f2
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+	std	%f0, [%o2]
+	std	%f2, [%o2 + 8]
+	std	%f60, [%o2 + 16]
+	std	%f62, [%o2 + 24]
+
+	add	%o1, 32, %o1
+	subcc	%o3, 32, %o3
+	bne	ctr128_loop
+	add	%o2, 32, %o2
+
+ctr128_loop_end:
+	stx	%g3, [%o4]
+	retl
+	stx	%g4, [%o4 + 8]
+
+	SET_SIZE(t4_aes128_ctr_crypt)
+
+
+	ENTRY(t4_aes192_ctr_crypt)
+
+	ldx	[%o4], %g3	! IV
+	ldx	[%o4 +8], %g4	! IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+	and	%o3, 16, %g5
+	brz	%g5, ctr192_loop
+
+	xor	%g1, %g3, %g5
+	movxtod	%g5, %f60
+	xor	%g2, %g4, %g5
+	movxtod	%g5, %f62
+	inc	%g4
+
+	TWELVE_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	be	ctr192_loop_end
+	add	%o2, 16, %o2
+
+ctr192_loop:
+	xor	%g1, %g3, %g5
+	movxtod	%g5, %f0
+	xor	%g2, %g4, %g5
+	movxtod	%g5, %f2
+	inc	%g4
+
+	xor	%g1, %g3, %g5
+	movxtod	%g5, %f60
+	xor	%g2, %g4, %g5
+	movxtod	%g5, %f62
+	inc	%g4
+
+	TWELVE_EROUNDS_2
+
+	ldd	[%o1], %f6		!input
+	ldd	[%o1 + 8], %f4		!input
+	ldd	[%o1 + 16], %f56	!input
+	ldd	[%o1 + 24], %f58	!input
+	fxor	%f0, %f6, %f0
+	fxor	%f2, %f4, %f2
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+	std	%f0, [%o2]
+	std	%f2, [%o2 + 8]
+	std	%f60, [%o2 + 16]
+	std	%f62, [%o2 + 24]
+
+	add	%o1, 32, %o1
+	subcc	%o3, 32, %o3
+	bne	ctr192_loop
+	add	%o2, 32, %o2
+
+ctr192_loop_end:
+	stx	%g3, [%o4]
+	retl
+	stx	%g4, [%o4 + 8]
+
+	SET_SIZE(t4_aes192_ctr_crypt)
+
+
+	ENTRY(t4_aes256_ctr_crypt)
+
+	ldx	[%o4], %g3	! IV
+	ldx	[%o4 +8], %g4	! IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+	and	%o3, 16, %g5
+	brz	%g5, ctr256_loop
+
+	xor	%g1, %g3, %g5
+	movxtod	%g5, %f60
+	xor	%g2, %g4, %g5
+	movxtod	%g5, %f62
+	inc	%g4
+
+	FOURTEEN_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	be	ctr256_loop_end
+	add	%o2, 16, %o2
+
+ctr256_loop:
+	xor	%g1, %g3, %g5
+	movxtod	%g5, %f20
+	xor	%g2, %g4, %g5
+	movxtod	%g5, %f22
+	inc	%g4
+
+	xor	%g1, %g3, %g5
+	movxtod	%g5, %f60
+	xor	%g2, %g4, %g5
+	movxtod	%g5, %f62
+	inc	%g4
+
+	FOURTEEN_EROUNDS_2
+
+	ldd	[%o1], %f56		!input
+	ldd	[%o1 + 8], %f58		!input
+	fxor	%f20, %f56, %f20
+	fxor	%f22, %f58, %f22
+	ldd	[%o1 + 16], %f56	!input
+	ldd	[%o1 + 24], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+	std	%f20, [%o2]
+	std	%f22, [%o2 + 8]
+	std	%f60, [%o2 + 16]
+	std	%f62, [%o2 + 24]
+
+	add	%o1, 32, %o1
+	subcc	%o3, 32, %o3
+	bne	ctr256_loop
+	add	%o2, 32, %o2
+
+	ldd	[%o0 + 0x60], %f20
+	ldd	[%o0 + 0x68], %f22
+
+ctr256_loop_end:
+	stx	%g3, [%o4]
+	retl
+	stx	%g4, [%o4 + 8]
+
+	SET_SIZE(t4_aes256_ctr_crypt)
+
+#else
+
+	ENTRY(t4_aes128_ctr_crypt)
+
+	ldx	[%o4], %g3	! IV
+	ldx	[%o4 +8], %g4	! IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+ctr128_loop:
+	xor	%g1, %g3, %g5
+	movxtod	%g5, %f60
+	xor	%g2, %g4, %g5
+	movxtod	%g5, %f62
+	inc	%g4
+
+	TEN_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	ctr128_loop
+	add	%o2, 16, %o2
+
+	stx	%g3, [%o4]
+	retl
+	stx	%g4, [%o4 + 8]
+
+	SET_SIZE(t4_aes128_ctr_crypt)
+
+	ENTRY(t4_aes192_ctr_crypt)
+
+	ldx	[%o4], %g3	! IV
+	ldx	[%o4 +8], %g4	! IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+ctr192_loop:
+	xor	%g1, %g3, %g5
+	movxtod	%g5, %f60
+	xor	%g2, %g4, %g5
+	movxtod	%g5, %f62
+	inc	%g4
+
+	TWELVE_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	ctr192_loop
+	add	%o2, 16, %o2
+
+	stx	%g3, [%o4]
+	retl
+	stx	%g4, [%o4 + 8]
+
+	SET_SIZE(t4_aes192_ctr_crypt)
+
+
+	ENTRY(t4_aes256_ctr_crypt)
+
+	ldx	[%o4], %g3	! IV
+	ldx	[%o4 +8], %g4	! IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+ctr256_loop:
+	xor	%g1, %g3, %g5
+	movxtod	%g5, %f60
+	xor	%g2, %g4, %g5
+	movxtod	%g5, %f62
+	inc	%g4
+
+	FOURTEEN_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	ctr256_loop
+	add	%o2, 16, %o2
+
+	stx	%g3, [%o4]
+	retl
+	stx	%g4, [%o4 + 8]
+
+	SET_SIZE(t4_aes256_ctr_crypt)
+
+#endif
+
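+! CFB128 encryption: the previous ciphertext block (initially the IV) is
+! encrypted and XORed with the plaintext; the resulting ciphertext becomes
+! the next feedback block and is saved back to the IV buffer when the loop
+! ends.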
+	ENTRY(t4_aes128_cfb128_encrypt)
+
+	ldd	[%o4], %f60	! IV
+	ldd	[%o4 +8], %f62	! IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+cfb128_128_loop:
+	movxtod	%g1, %f56
+	movxtod	%g2, %f58
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	TEN_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	cfb128_128_loop
+	add	%o2, 16, %o2
+
+	std	%f60, [%o4]
+	retl
+	std	%f62, [%o4 + 8]
+
+	SET_SIZE(t4_aes128_cfb128_encrypt)
+
+
+	ENTRY(t4_aes192_cfb128_encrypt)
+
+	ldd	[%o4], %f60	! IV
+	ldd	[%o4 +8], %f62	! IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+cfb128_192_loop:
+	movxtod	%g1, %f56
+	movxtod	%g2, %f58
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	TWELVE_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	cfb128_192_loop
+	add	%o2, 16, %o2
+
+	std	%f60, [%o4]
+	retl
+	std	%f62, [%o4 + 8]
+
+	SET_SIZE(t4_aes192_cfb128_encrypt)
+
+
+	ENTRY(t4_aes256_cfb128_encrypt)
+
+	ldd	[%o4], %f60	! IV
+	ldd	[%o4 +8], %f62	! IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+cfb128_256_loop:
+	movxtod	%g1, %f56
+	movxtod	%g2, %f58
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	FOURTEEN_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	cfb128_256_loop
+	add	%o2, 16, %o2
+
+	std	%f60, [%o4]
+	retl
+	std	%f62, [%o4 + 8]
+
+	SET_SIZE(t4_aes256_cfb128_encrypt)
+
+
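+! The load_keys_for_decrypt routines load the schedule into the FP registers
+! in reverse order (ks[0]/ks[1] end up in %f52/%f54), so the DROUNDS macros
+! walk the round keys backwards; the decrypt routines apply
+! ks[last-1]/ks[last] themselves as the initial AddRoundKey (the ldx/xor
+! pairs marked "initial ARK").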
+	ENTRY(t4_aes128_load_keys_for_decrypt)
+
+	ldd	[%o0], %f52
+	ldd	[%o0 + 0x8], %f54
+	ldd	[%o0 + 0x10], %f48
+	ldd	[%o0 + 0x18], %f50
+	ldd	[%o0 + 0x20], %f44
+	ldd	[%o0 + 0x28], %f46
+	ldd	[%o0 + 0x30], %f40
+	ldd	[%o0 + 0x38], %f42
+	ldd	[%o0 + 0x40], %f36
+	ldd	[%o0 + 0x48], %f38
+	ldd	[%o0 + 0x50], %f32
+	ldd	[%o0 + 0x58], %f34
+	ldd	[%o0 + 0x60], %f28
+	ldd	[%o0 + 0x68], %f30
+	ldd	[%o0 + 0x70], %f24
+	ldd	[%o0 + 0x78], %f26
+	ldd	[%o0 + 0x80], %f20
+	ldd	[%o0 + 0x88], %f22
+	ldd	[%o0 + 0x90], %f16
+	retl
+	ldd	[%o0 + 0x98], %f18
+
+	SET_SIZE(t4_aes128_load_keys_for_decrypt)
+
+
+	ENTRY(t4_aes192_load_keys_for_decrypt)
+
+	ldd	[%o0], %f52
+	ldd	[%o0 + 0x8], %f54
+	ldd	[%o0 + 0x10], %f48
+	ldd	[%o0 + 0x18], %f50
+	ldd	[%o0 + 0x20], %f44
+	ldd	[%o0 + 0x28], %f46
+	ldd	[%o0 + 0x30], %f40
+	ldd	[%o0 + 0x38], %f42
+	ldd	[%o0 + 0x40], %f36
+	ldd	[%o0 + 0x48], %f38
+	ldd	[%o0 + 0x50], %f32
+	ldd	[%o0 + 0x58], %f34
+	ldd	[%o0 + 0x60], %f28
+	ldd	[%o0 + 0x68], %f30
+	ldd	[%o0 + 0x70], %f24
+	ldd	[%o0 + 0x78], %f26
+	ldd	[%o0 + 0x80], %f20
+	ldd	[%o0 + 0x88], %f22
+	ldd	[%o0 + 0x90], %f16
+	ldd	[%o0 + 0x98], %f18
+	ldd	[%o0 + 0xa0], %f12
+	ldd	[%o0 + 0xa8], %f14
+	ldd	[%o0 + 0xb0], %f8
+	retl
+	ldd	[%o0 + 0xb8], %f10
+
+	SET_SIZE(t4_aes192_load_keys_for_decrypt)
+
+
+	ENTRY(t4_aes256_load_keys_for_decrypt)
+
+
+	ldd	[%o0], %f52
+	ldd	[%o0 + 0x8], %f54
+	ldd	[%o0 + 0x10], %f48
+	ldd	[%o0 + 0x18], %f50
+	ldd	[%o0 + 0x20], %f44
+	ldd	[%o0 + 0x28], %f46
+	ldd	[%o0 + 0x30], %f40
+	ldd	[%o0 + 0x38], %f42
+	ldd	[%o0 + 0x40], %f36
+	ldd	[%o0 + 0x48], %f38
+	ldd	[%o0 + 0x50], %f32
+	ldd	[%o0 + 0x58], %f34
+	ldd	[%o0 + 0x60], %f28
+	ldd	[%o0 + 0x68], %f30
+	ldd	[%o0 + 0x70], %f24
+	ldd	[%o0 + 0x78], %f26
+	ldd	[%o0 + 0x80], %f20
+	ldd	[%o0 + 0x88], %f22
+	ldd	[%o0 + 0x90], %f16
+	ldd	[%o0 + 0x98], %f18
+	ldd	[%o0 + 0xa0], %f12
+	ldd	[%o0 + 0xa8], %f14
+	ldd	[%o0 + 0xb0], %f8
+	ldd	[%o0 + 0xb8], %f10
+	ldd	[%o0 + 0xc0], %f4
+	ldd	[%o0 + 0xc8], %f6
+	ldd	[%o0 + 0xd0], %f0
+	retl
+	ldd	[%o0 + 0xd8], %f2
+
+	SET_SIZE(t4_aes256_load_keys_for_decrypt)
+
+
+#define	TEST_PARALLEL_ECB_DECRYPT
+#ifdef	TEST_PARALLEL_ECB_DECRYPT
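+! As with encryption, the parallel decrypt variants handle an odd leading
+! block first and then decrypt two blocks per iteration with the
+! *_DROUNDS_2 macros.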
+	ENTRY(t4_aes128_ecb_decrypt)
+
+	ldx	[%o0 + 0xa0], %g1	!ks[last-1]
+	ldx	[%o0 + 0xa8], %g2	!ks[last]
+	and	%o3, 16, %o4
+	brz	%o4, ecbdec128_loop
+	nop
+
+	ldx	[%o1], %o4
+	ldx	[%o1 + 8], %o5
+	xor	%g1, %o4, %g3	!initial ARK
+	movxtod	%g3, %f60
+	xor	%g2, %o5, %g3	!initial ARK
+	movxtod	%g3, %f62
+
+	TEN_DROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 0x8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	be	ecbdec128_loop_end
+	add	%o2, 16, %o2
+
+ecbdec128_loop:
+	ldx	[%o1], %o4
+	ldx	[%o1 + 8], %o5
+	xor	%g1, %o4, %g3	!initial ARK
+	movxtod	%g3, %f0
+	xor	%g2, %o5, %g3	!initial ARK
+	movxtod	%g3, %f2
+	ldx	[%o1 + 16], %o4
+	ldx	[%o1 + 24], %o5
+	xor	%g1, %o4, %g3	!initial ARK
+	movxtod	%g3, %f60
+	xor	%g2, %o5, %g3	!initial ARK
+	movxtod	%g3, %f62
+
+	TEN_DROUNDS_2
+
+	std	%f0, [%o2]
+	std	%f2, [%o2 + 8]
+	std	%f60, [%o2 + 16]
+	std	%f62, [%o2 + 24]
+
+	add	%o1, 32, %o1
+	subcc	%o3, 32, %o3
+	bne	ecbdec128_loop
+	add	%o2, 32, %o2
+ecbdec128_loop_end:
+
+	retl
+	nop
+
+	SET_SIZE(t4_aes128_ecb_decrypt)
+
+	ENTRY(t4_aes192_ecb_decrypt)
+
+	ldx	[%o0 + 0xc0], %g1	!ks[last-1]
+	ldx	[%o0 + 0xc8], %g2	!ks[last]
+	and	%o3, 16, %o4
+	brz	%o4, ecbdec192_loop
+	nop
+
+	ldx	[%o1], %o4
+	ldx	[%o1 + 8], %o5
+	xor	%g1, %o4, %g3	!initial ARK
+	movxtod	%g3, %f60
+	xor	%g2, %o5, %g3	!initial ARK
+	movxtod	%g3, %f62
+
+	TWELVE_DROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 0x8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	be	ecbdec192_loop_end
+	add	%o2, 16, %o2
+
+ecbdec192_loop:
+	ldx	[%o1], %o4
+	ldx	[%o1 + 8], %o5
+	xor	%g1, %o4, %g3	!initial ARK
+	movxtod	%g3, %f0
+	xor	%g2, %o5, %g3	!initial ARK
+	movxtod	%g3, %f2
+	ldx	[%o1 + 16], %o4
+	ldx	[%o1 + 24], %o5
+	xor	%g1, %o4, %g3	!initial ARK
+	movxtod	%g3, %f60
+	xor	%g2, %o5, %g3	!initial ARK
+	movxtod	%g3, %f62
+
+	TWELVE_DROUNDS_2
+
+	std	%f0, [%o2]
+	std	%f2, [%o2 + 8]
+	std	%f60, [%o2 + 16]
+	std	%f62, [%o2 + 24]
+
+	add	%o1, 32, %o1
+	subcc	%o3, 32, %o3
+	bne	ecbdec192_loop
+	add	%o2, 32, %o2
+ecbdec192_loop_end:
+
+	retl
+	nop
+
+	SET_SIZE(t4_aes192_ecb_decrypt)
+
+
+	ENTRY(t4_aes256_ecb_decrypt)
+
+	ldx	[%o0 + 0xe0], %g1	!ks[last-1]
+	ldx	[%o0 + 0xe8], %g2	!ks[last]
+	and	%o3, 16, %o4
+	brz	%o4, ecbdec256_loop
+	nop
+
+	ldx	[%o1], %o4
+	ldx	[%o1 + 8], %o5
+	xor	%g1, %o4, %g3	!initial ARK
+	movxtod	%g3, %f60
+	xor	%g2, %o5, %g3	!initial ARK
+	movxtod	%g3, %f62
+
+	FOURTEEN_DROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 0x8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	be	ecbdec256_loop_end
+	add	%o2, 16, %o2
+
+ecbdec256_loop:
+	ldx	[%o1], %o4
+	ldx	[%o1 + 8], %o5
+	xor	%g1, %o4, %g3	!initial ARK
+	movxtod	%g3, %f20
+	xor	%g2, %o5, %g3	!initial ARK
+	movxtod	%g3, %f22
+	ldx	[%o1 + 16], %o4
+	ldx	[%o1 + 24], %o5
+	xor	%g1, %o4, %g3	!initial ARK
+	movxtod	%g3, %f60
+	xor	%g2, %o5, %g3	!initial ARK
+	movxtod	%g3, %f62
+
+	FOURTEEN_DROUNDS_2
+
+	std	%f20, [%o2]
+	std	%f22, [%o2 + 8]
+	std	%f60, [%o2 + 16]
+	std	%f62, [%o2 + 24]
+
+	add	%o1, 32, %o1
+	subcc	%o3, 32, %o3
+	bne	ecbdec256_loop
+	add	%o2, 32, %o2
+
+	ldd	[%o0 + 0x80], %f20
+	ldd	[%o0 + 0x88], %f22
+
+ecbdec256_loop_end:
+
+	retl
+	nop
+
+	SET_SIZE(t4_aes256_ecb_decrypt)
+
+#else
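+/* Unpipelined ECB decrypt: one block per iteration. */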
+
+	ENTRY(t4_aes128_ecb_decrypt)
+
+	ldx	[%o0 + 0xa0], %g1	!ks[last-1]
+	ldx	[%o0 + 0xa8], %g2	!ks[last]
+
+ecbdec128_loop:
+	ldx	[%o1], %o4
+	ldx	[%o1 + 8], %o5
+	xor	%g1, %o4, %g3	!initial ARK
+	movxtod	%g3, %f60
+	xor	%g2, %o5, %g3	!initial ARK
+	movxtod	%g3, %f62
+
+	TEN_DROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 0x8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	ecbdec128_loop
+	add	%o2, 16, %o2
+
+	retl
+	nop
+
+	SET_SIZE(t4_aes128_ecb_decrypt)
+
+
+	ENTRY(t4_aes192_ecb_decrypt)
+
+	ldx	[%o0 + 0xc0], %g1	!ks[last-1]
+	ldx	[%o0 + 0xc8], %g2	!ks[last]
+
+ecbdec192_loop:
+	ldx	[%o1], %o4
+	ldx	[%o1 + 8], %o5
+	xor	%g1, %o4, %g3	!initial ARK
+	movxtod	%g3, %f60
+	xor	%g2, %o5, %g3	!initial ARK
+	movxtod	%g3, %f62
+
+	TWELVE_DROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 0x8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	ecbdec192_loop
+	add	%o2, 16, %o2
+
+	retl
+	nop
+
+	SET_SIZE(t4_aes192_ecb_decrypt)
+
+
+	ENTRY(t4_aes256_ecb_decrypt)
+
+	ldx	[%o0 + 0xe0], %g1	!ks[last-1]
+	ldx	[%o0 + 0xe8], %g2	!ks[last]
+
+ecbdec256_loop:
+	ldx	[%o1], %o4
+	ldx	[%o1 + 8], %o5
+	xor	%g1, %o4, %g3	!initial ARK
+	movxtod	%g3, %f60
+	xor	%g2, %o5, %g3	!initial ARK
+	movxtod	%g3, %f62
+
+	FOURTEEN_DROUNDS
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 0x8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	ecbdec256_loop
+	add	%o2, 16, %o2
+
+	retl
+	nop
+
+	SET_SIZE(t4_aes256_ecb_decrypt)
+
+#endif
+
+#define	TEST_PARALLEL_CBC_DECRYPT
+#ifdef	TEST_PARALLEL_CBC_DECRYPT
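+
+/*
+ * Pipelined CBC decrypt: two blocks per iteration (%i0 = expanded key
+ * schedule, %i1 = input, %i2 = output, %i3 = length in bytes,
+ * %i4 = IV).  Each decrypted block is XORed with the previous
+ * ciphertext block (the running IV), and the last ciphertext block is
+ * stored back through %i4 as the next IV.  An odd 16-byte remainder
+ * is handled before entering the two-block loop.
+ */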
+	ENTRY(t4_aes128_cbc_decrypt)
+
+	save	%sp, -SA(MINFRAME), %sp
+	ldx	[%i4], %o0		!IV
+	ldx	[%i4 + 8], %o1		!IV
+	ldx	[%i0 + 0xa0], %o2	!ks[last-1]
+	ldx	[%i0 + 0xa8], %o3	!ks[last]
+	and	%i3, 16, %o4
+	brz	%o4, cbcdec128_loop
+	nop
+
+	ldx	[%i1], %o4
+	ldx	[%i1 + 8], %o5
+	xor	%o2, %o4, %g1	!initial ARK
+	movxtod	%g1, %f60
+	xor	%o3, %o5, %g1	!initial ARK
+	movxtod	%g1, %f62
+
+	TEN_DROUNDS
+
+	movxtod	%o0, %f56
+	movxtod	%o1, %f58
+	mov	%o4, %o0	!save last block as next IV
+	mov	%o5, %o1
+	fxor	%f56, %f60, %f60	!add in previous IV
+	fxor	%f58, %f62, %f62
+
+	std	%f60, [%i2]
+	std	%f62, [%i2 + 0x8]
+
+	add	%i1, 16, %i1
+	subcc	%i3, 16, %i3
+	be	cbcdec128_loop_end
+	add	%i2, 16, %i2
+
+
+cbcdec128_loop:
+	ldx	[%i1], %g4
+	ldx	[%i1 + 8], %g5
+	xor	%o2, %g4, %g1	!initial ARK
+	movxtod	%g1, %f0
+	xor	%o3, %g5, %g1	!initial ARK
+	movxtod	%g1, %f2
+
+	ldx	[%i1 + 16], %o4
+	ldx	[%i1 + 24], %o5
+	xor	%o2, %o4, %g1	!initial ARK
+	movxtod	%g1, %f60
+	xor	%o3, %o5, %g1	!initial ARK
+	movxtod	%g1, %f62
+
+	TEN_DROUNDS_2
+
+	movxtod	%o0, %f6
+	movxtod	%o1, %f4
+	fxor	%f6, %f0, %f0	!add in previous IV
+	fxor	%f4, %f2, %f2
+
+	std	%f0, [%i2]
+	std	%f2, [%i2 + 8]
+
+	movxtod	%g4, %f56
+	movxtod	%g5, %f58
+	mov	%o4, %o0	!save last block as next IV
+	mov	%o5, %o1
+	fxor	%f56, %f60, %f60	!add in previous IV
+	fxor	%f58, %f62, %f62
+
+	std	%f60, [%i2 + 16]
+	std	%f62, [%i2 + 24]
+
+	add	%i1, 32, %i1
+	subcc	%i3, 32, %i3
+	bne	cbcdec128_loop
+	add	%i2, 32, %i2
+
+cbcdec128_loop_end:
+	stx	%o0, [%i4]
+	stx	%o1, [%i4 + 8]
+	ret
+	restore
+
+	SET_SIZE(t4_aes128_cbc_decrypt)
+
+
+	ENTRY(t4_aes192_cbc_decrypt)
+
+	save	%sp, -SA(MINFRAME), %sp
+	ldx	[%i4], %o0		!IV
+	ldx	[%i4 + 8], %o1		!IV
+	ldx	[%i0 + 0xc0], %o2	!ks[last-1]
+	ldx	[%i0 + 0xc8], %o3	!ks[last]
+	and	%i3, 16, %o4
+	brz	%o4, cbcdec192_loop
+	nop
+
+	ldx	[%i1], %o4
+	ldx	[%i1 + 8], %o5
+	xor	%o2, %o4, %g1	!initial ARK
+	movxtod	%g1, %f60
+	xor	%o3, %o5, %g1	!initial ARK
+	movxtod	%g1, %f62
+
+	TWELVE_DROUNDS
+
+	movxtod	%o0, %f56
+	movxtod	%o1, %f58
+	mov	%o4, %o0	!save last block as next IV
+	mov	%o5, %o1
+	fxor	%f56, %f60, %f60	!add in previous IV
+	fxor	%f58, %f62, %f62
+
+	std	%f60, [%i2]
+	std	%f62, [%i2 + 0x8]
+
+	add	%i1, 16, %i1
+	subcc	%i3, 16, %i3
+	be	cbcdec192_loop_end
+	add	%i2, 16, %i2
+
+
+cbcdec192_loop:
+	ldx	[%i1], %g4
+	ldx	[%i1 + 8], %g5
+	xor	%o2, %g4, %g1	!initial ARK
+	movxtod	%g1, %f0
+	xor	%o3, %g5, %g1	!initial ARK
+	movxtod	%g1, %f2
+
+	ldx	[%i1 + 16], %o4
+	ldx	[%i1 + 24], %o5
+	xor	%o2, %o4, %g1	!initial ARK
+	movxtod	%g1, %f60
+	xor	%o3, %o5, %g1	!initial ARK
+	movxtod	%g1, %f62
+
+	TWELVE_DROUNDS_2
+
+	movxtod	%o0, %f6
+	movxtod	%o1, %f4
+	fxor	%f6, %f0, %f0	!add in previous IV
+	fxor	%f4, %f2, %f2
+
+	std	%f0, [%i2]
+	std	%f2, [%i2 + 8]
+
+	movxtod	%g4, %f56
+	movxtod	%g5, %f58
+	mov	%o4, %o0	!save last block as next IV
+	mov	%o5, %o1
+	fxor	%f56, %f60, %f60	!add in previous IV
+	fxor	%f58, %f62, %f62
+
+	std	%f60, [%i2 + 16]
+	std	%f62, [%i2 + 24]
+
+	add	%i1, 32, %i1
+	subcc	%i3, 32, %i3
+	bne	cbcdec192_loop
+	add	%i2, 32, %i2
+
+cbcdec192_loop_end:
+	stx	%o0, [%i4]
+	stx	%o1, [%i4 + 8]
+	ret
+	restore
+
+	SET_SIZE(t4_aes192_cbc_decrypt)
+
+
+	ENTRY(t4_aes256_cbc_decrypt)
+
+	save	%sp, -SA(MINFRAME), %sp
+	mov	%i0, %o0		!FOURTEEN_DROUNDS uses %o0
+	ldx	[%i4], %g2		!IV
+	ldx	[%i4 + 8], %o1		!IV
+	ldx	[%o0 + 0xe0], %o2	!ks[last-1]
+	ldx	[%o0 + 0xe8], %o3	!ks[last]
+	and	%i3, 16, %o4
+	brz	%o4, cbcdec256_loop
+	nop
+
+	ldx	[%i1], %o4
+	ldx	[%i1 + 8], %o5
+	xor	%o2, %o4, %g1	!initial ARK
+	movxtod	%g1, %f60
+	xor	%o3, %o5, %g1	!initial ARK
+	movxtod	%g1, %f62
+
+	FOURTEEN_DROUNDS
+
+	movxtod	%g2, %f56
+	movxtod	%o1, %f58
+	mov	%o4, %g2	!save last block as next IV
+	mov	%o5, %o1
+	fxor	%f56, %f60, %f60	!add in previous IV
+	fxor	%f58, %f62, %f62
+
+	std	%f60, [%i2]
+	std	%f62, [%i2 + 0x8]
+
+	add	%i1, 16, %i1
+	subcc	%i3, 16, %i3
+	be	cbcdec256_loop_end
+	add	%i2, 16, %i2
+
+
+cbcdec256_loop:
+	ldx	[%i1], %g4
+	ldx	[%i1 + 8], %g5
+	xor	%o2, %g4, %g1	!initial ARK
+	movxtod	%g1, %f20
+	xor	%o3, %g5, %g1	!initial ARK
+	movxtod	%g1, %f22
+
+	ldx	[%i1 + 16], %o4
+	ldx	[%i1 + 24], %o5
+	xor	%o2, %o4, %g1	!initial ARK
+	movxtod	%g1, %f60
+	xor	%o3, %o5, %g1	!initial ARK
+	movxtod	%g1, %f62
+
+	FOURTEEN_DROUNDS_2
+
+	movxtod	%g2, %f56
+	movxtod	%o1, %f58
+	fxor	%f56, %f20, %f20	!add in previous IV
+	fxor	%f58, %f22, %f22
+
+	std	%f20, [%i2]
+	std	%f22, [%i2 + 8]
+
+	movxtod	%g4, %f56
+	movxtod	%g5, %f58
+	mov	%o4, %g2	!save last block as next IV
+	mov	%o5, %o1
+	fxor	%f56, %f60, %f60	!add in previous IV
+	fxor	%f58, %f62, %f62
+
+	std	%f60, [%i2 + 16]
+	std	%f62, [%i2 + 24]
+
+	add	%i1, 32, %i1
+	subcc	%i3, 32, %i3
+	bne	cbcdec256_loop
+	add	%i2, 32, %i2
+
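+	! restore the round-key doublewords the two-block loop clobbered in %f20/%f22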
+	ldd	[%o0 + 0x80], %f20
+	ldd	[%o0 + 0x88], %f22
+
+cbcdec256_loop_end:
+	stx	%g2, [%i4]
+	stx	%o1, [%i4 + 8]
+	ret
+	restore
+
+	SET_SIZE(t4_aes256_cbc_decrypt)
+
+#else
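+/* Unpipelined CBC decrypt: one block per iteration. */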
+
+	ENTRY(t4_aes128_cbc_decrypt)
+
+	save	%sp, -SA(MINFRAME), %sp
+	ldx	[%i4], %o0		!IV
+	ldx	[%i4 + 8], %o1		!IV
+	ldx	[%i0 + 0xa0], %o2	!ks[last-1]
+	ldx	[%i0 + 0xa8], %o3	!ks[last]
+
+cbcdec128_loop:
+	ldx	[%i1], %o4
+	ldx	[%i1 + 8], %o5
+	xor	%o2, %o4, %g1	!initial ARK
+	movxtod	%g1, %f60
+	xor	%o3, %o5, %g1	!initial ARK
+	movxtod	%g1, %f62
+
+	TEN_DROUNDS
+
+	movxtod	%o0, %f56
+	movxtod	%o1, %f58
+	mov	%o4, %o0	!save last block as next IV
+	mov	%o5, %o1
+	fxor	%f56, %f60, %f60	!add in previous IV
+	fxor	%f58, %f62, %f62
+
+	std	%f60, [%i2]
+	std	%f62, [%i2 + 0x8]
+
+	add	%i1, 16, %i1
+	subcc	%i3, 16, %i3
+	bne	cbcdec128_loop
+	add	%i2, 16, %i2
+
+	stx	%o0, [%i4]
+	stx	%o1, [%i4 + 8]
+	ret
+	restore
+
+	SET_SIZE(t4_aes128_cbc_decrypt)
+
+
+	ENTRY(t4_aes192_cbc_decrypt)
+
+	save	%sp, -SA(MINFRAME), %sp
+	ldx	[%i4], %o0		!IV
+	ldx	[%i4 + 8], %o1		!IV
+	ldx	[%i0 + 0xc0], %o2	!ks[last-1]
+	ldx	[%i0 + 0xc8], %o3	!ks[last]
+
+cbcdec192_loop:
+	ldx	[%i1], %o4
+	ldx	[%i1 + 8], %o5
+	xor	%o2, %o4, %g1	!initial ARK
+	movxtod	%g1, %f60
+	xor	%o3, %o5, %g1	!initial ARK
+	movxtod	%g1, %f62
+
+	TWELVE_DROUNDS
+
+	movxtod	%o0, %f56
+	movxtod	%o1, %f58
+	mov	%o4, %o0	!save last block as next IV
+	mov	%o5, %o1
+	fxor	%f56, %f60, %f60	!add in previous IV
+	fxor	%f58, %f62, %f62
+
+	std	%f60, [%i2]
+	std	%f62, [%i2 + 0x8]
+
+	add	%i1, 16, %i1
+	subcc	%i3, 16, %i3
+	bne	cbcdec192_loop
+	add	%i2, 16, %i2
+
+	stx	%o0, [%i4]
+	stx	%o1, [%i4 + 8]
+	ret
+	restore
+
+	SET_SIZE(t4_aes192_cbc_decrypt)
+
+
+	ENTRY(t4_aes256_cbc_decrypt)
+
+	save	%sp, -SA(MINFRAME), %sp
+	ldx	[%i4], %o0		!IV
+	ldx	[%i4 + 8], %o1		!IV
+	ldx	[%i0 + 0xe0], %o2	!ks[last-1]
+	ldx	[%i0 + 0xe8], %o3	!ks[last]
+
+cbcdec256_loop:
+	ldx	[%i1], %o4
+	ldx	[%i1 + 8], %o5
+	xor	%o2, %o4, %g1	!initial ARK
+	movxtod	%g1, %f60
+	xor	%o3, %o5, %g1	!initial ARK
+	movxtod	%g1, %f62
+
+	FOURTEEN_DROUNDS
+
+	movxtod	%o0, %f56
+	movxtod	%o1, %f58
+	mov	%o4, %o0	!save last block as next IV
+	mov	%o5, %o1
+	fxor	%f56, %f60, %f60	!add in previous IV
+	fxor	%f58, %f62, %f62
+
+	std	%f60, [%i2]
+	std	%f62, [%i2 + 0x8]
+
+	add	%i1, 16, %i1
+	subcc	%i3, 16, %i3
+	bne	cbcdec256_loop
+	add	%i2, 16, %i2
+
+	stx	%o0, [%i4]
+	stx	%o1, [%i4 + 8]
+	ret
+	restore
+
+	SET_SIZE(t4_aes256_cbc_decrypt)
+
+#endif
+
+#define	TEST_PARALLEL_CFB128_DECRYPT
+#ifdef	TEST_PARALLEL_CFB128_DECRYPT
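+/*
+ * Pipelined CFB128 decrypt: two blocks per iteration (%o0 = expanded
+ * key schedule, %o1 = input, %o2 = output, %o3 = length in bytes,
+ * %o4 = IV).  The keystream for a block is the encryption of the
+ * previous ciphertext block (initially the IV), so the *_EROUNDS
+ * macros are used; ks[0]/ks[1] are applied from %g1/%g2 as the
+ * initial ARK.  The last ciphertext block is stored back through %o4
+ * as the next IV.
+ */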
+
+	ENTRY(t4_aes128_cfb128_decrypt)
+
+	ldd	[%o4], %f56	!IV
+	ldd	[%o4 + 8], %f58	!IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+	and	%o3, 16, %o5
+	brz	%o5, cfb128dec_128_loop	!movxtod below fills the branch delay slot
+
+	movxtod	%g1, %f60
+	movxtod	%g2, %f62
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	/* CFB mode uses encryption for the decrypt operation */
+	TEN_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	be	cfb128dec_128_loop_end
+	add	%o2, 16, %o2
+
+cfb128dec_128_loop:
+	ldd	[%o1], %f6	!input
+	ldd	[%o1 + 8], %f4	!input
+	movxtod	%g1, %f60
+	movxtod	%g2, %f62
+	fxor	%f60, %f6, %f0
+	fxor	%f62, %f4, %f2
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	/* CFB mode uses encryption for the decrypt operation */
+	TEN_EROUNDS_2
+
+	ldd	[%o1], %f6	!input
+	ldd	[%o1 + 8], %f4	!input
+	ldd	[%o1 + 16], %f56	!input
+	ldd	[%o1 + 24], %f58	!input
+
+	fxor	%f60, %f6, %f6
+	fxor	%f62, %f4, %f4
+	fxor	%f0, %f56, %f60
+	fxor	%f2, %f58, %f62
+
+	std	%f6, [%o2]
+	std	%f4, [%o2 + 8]
+	std	%f60, [%o2 + 16]
+	std	%f62, [%o2 + 24]
+
+	add	%o1, 32, %o1
+	subcc	%o3, 32, %o3
+	bne	cfb128dec_128_loop
+	add	%o2, 32, %o2
+
+cfb128dec_128_loop_end:
+	std	%f56, [%o4]
+	retl
+	std	%f58, [%o4 + 8]
+
+	SET_SIZE(t4_aes128_cfb128_decrypt)
+
+
+	ENTRY(t4_aes192_cfb128_decrypt)
+
+	ldd	[%o4], %f56	!IV
+	ldd	[%o4 + 8], %f58	!IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+	and	%o3, 16, %o5
+	brz	%o5, cfb128dec_192_loop	!movxtod below fills the branch delay slot
+
+	movxtod	%g1, %f60
+	movxtod	%g2, %f62
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	/* CFB mode uses encryption for the decrypt operation */
+	TWELVE_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	be	cfb128dec_192_loop_end
+	add	%o2, 16, %o2
+
+cfb128dec_192_loop:
+	ldd	[%o1], %f6	!input
+	ldd	[%o1 + 8], %f4	!input
+	movxtod	%g1, %f60
+	movxtod	%g2, %f62
+	fxor	%f60, %f6, %f0
+	fxor	%f62, %f4, %f2
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	/* CFB mode uses encryption for the decrypt operation */
+	TWELVE_EROUNDS_2
+
+	ldd	[%o1], %f6	!input
+	ldd	[%o1 + 8], %f4	!input
+	ldd	[%o1 + 16], %f56	!input
+	ldd	[%o1 + 24], %f58	!input
+
+	fxor	%f60, %f6, %f6
+	fxor	%f62, %f4, %f4
+	fxor	%f0, %f56, %f60
+	fxor	%f2, %f58, %f62
+
+	std	%f6, [%o2]
+	std	%f4, [%o2 + 8]
+	std	%f60, [%o2 + 16]
+	std	%f62, [%o2 + 24]
+
+	add	%o1, 32, %o1
+	subcc	%o3, 32, %o3
+	bne	cfb128dec_192_loop
+	add	%o2, 32, %o2
+
+cfb128dec_192_loop_end:
+	std	%f56, [%o4]
+	retl
+	std	%f58, [%o4 + 8]
+
+	SET_SIZE(t4_aes192_cfb128_decrypt)
+
+
+	ENTRY(t4_aes256_cfb128_decrypt)
+
+	ldd	[%o4], %f56	!IV
+	ldd	[%o4 + 8], %f58	!IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+	and	%o3, 16, %o5
+	brz	%o5, cfb128dec_256_loop	!movxtod below fills the branch delay slot
+
+	movxtod	%g1, %f60
+	movxtod	%g2, %f62
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	/* CFB mode uses encryption for the decrypt operation */
+	FOURTEEN_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	be	cfb128dec_256_loop_end
+	add	%o2, 16, %o2
+
+cfb128dec_256_loop:
+	ldd	[%o1], %f20	!input
+	ldd	[%o1 + 8], %f22	!input
+	movxtod	%g1, %f60
+	movxtod	%g2, %f62
+	fxor	%f60, %f20, %f20
+	fxor	%f62, %f22, %f22
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	/* CFB mode uses encryption for the decrypt operation */
+	FOURTEEN_EROUNDS_2
+
+	ldd	[%o1 + 16], %f56	!input
+	ldd	[%o1 + 24], %f58	!input
+	fxor	%f20, %f56, %f20
+	fxor	%f22, %f58, %f22
+	std	%f20, [%o2 + 16]
+	std	%f22, [%o2 + 24]
+
+	ldd	[%o1], %f20	!input
+	ldd	[%o1 + 8], %f22	!input
+
+	fxor	%f60, %f20, %f20
+	fxor	%f62, %f22, %f22
+
+	std	%f20, [%o2]
+	std	%f22, [%o2 + 8]
+
+	add	%o1, 32, %o1
+	subcc	%o3, 32, %o3
+	bne	cfb128dec_256_loop
+	add	%o2, 32, %o2
+
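+	! reload %f20/%f22 (encrypt key-schedule words) clobbered by the two-block loop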
+	ldd	[%o0 + 0x60], %f20
+	ldd	[%o0 + 0x68], %f22
+
+cfb128dec_256_loop_end:
+	std	%f56, [%o4]
+	retl
+	std	%f58, [%o4 + 8]
+
+	SET_SIZE(t4_aes256_cfb128_decrypt)
+
+#else
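+/* Unpipelined CFB128 decrypt: one block per iteration. */
+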
+	ENTRY(t4_aes128_cfb128_decrypt)
+
+	ldd	[%o4], %f56	!IV
+	ldd	[%o4 + 8], %f58	!IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+cfb128dec_128_loop:
+	movxtod	%g1, %f60
+	movxtod	%g2, %f62
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	/* CFB mode uses encryption for the decrypt operation */
+	TEN_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	cfb128dec_128_loop
+	add	%o2, 16, %o2
+
+	std	%f56, [%o4]
+	retl
+	std	%f58, [%o4 + 8]
+
+	SET_SIZE(t4_aes128_cfb128_decrypt)
+
+
+	ENTRY(t4_aes192_cfb128_decrypt)
+
+	ldd	[%o4], %f56	!IV
+	ldd	[%o4 + 8], %f58	!IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+cfb128dec_192_loop:
+	movxtod	%g1, %f60
+	movxtod	%g2, %f62
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	/* CFB mode uses encryption for the decrypt operation */
+	TWELVE_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	cfb128dec_192_loop
+	add	%o2, 16, %o2
+
+	std	%f56, [%o4]
+	retl
+	std	%f58, [%o4 + 8]
+
+	SET_SIZE(t4_aes192_cfb128_decrypt)
+
+
+	ENTRY(t4_aes256_cfb128_decrypt)
+
+	ldd	[%o4], %f56	!IV
+	ldd	[%o4 + 8], %f58	!IV
+	ldx	[%o0], %g1	! ks[0]
+	ldx	[%o0 + 8], %g2	! ks[1]
+
+cfb128dec_256_loop:
+	movxtod	%g1, %f60
+	movxtod	%g2, %f62
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	/* CFB mode uses encryption for the decrypt operation */
+	FOURTEEN_EROUNDS
+
+	ldd	[%o1], %f56	!input
+	ldd	[%o1 + 8], %f58	!input
+	fxor	%f60, %f56, %f60
+	fxor	%f62, %f58, %f62
+
+	std	%f60, [%o2]
+	std	%f62, [%o2 + 8]
+
+	add	%o1, 16, %o1
+	subcc	%o3, 16, %o3
+	bne	cfb128dec_256_loop
+	add	%o2, 16, %o2
+
+	std	%f56, [%o4]
+	retl
+	std	%f58, [%o4 + 8]
+
+	SET_SIZE(t4_aes256_cfb128_decrypt)
+
+#endif
+
+#endif	/* lint || __lint */