git: bf5069fb6a3f - stable/14 - libcrypto: Switch back to the generated assembly in sys/crypto/openssl

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Fri, 08 Sep 2023 20:56:43 UTC
The branch stable/14 has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=bf5069fb6a3fc8fbf08ed23a4fd958af48cf902f

commit bf5069fb6a3fc8fbf08ed23a4fd958af48cf902f
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2023-08-29 21:46:44 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2023-09-08 20:55:44 +0000

    libcrypto: Switch back to the generated assembly in sys/crypto/openssl
    
    Reviewed by:    markj
    Differential Revision:  https://reviews.freebsd.org/D41569
    
    (cherry picked from commit 47d997021fbc7b662e9507deec1897d514d1224c)
---
 secure/lib/libcrypto/Makefile                      |     4 +-
 .../lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S  |  6390 ---------
 secure/lib/libcrypto/arch/aarch64/aesv8-armx.S     |  3181 -----
 secure/lib/libcrypto/arch/aarch64/arm64cpuid.S     |   130 -
 secure/lib/libcrypto/arch/aarch64/armv8-mont.S     |  2125 ---
 secure/lib/libcrypto/arch/aarch64/chacha-armv8.S   |  2035 ---
 .../libcrypto/arch/aarch64/ecp_nistz256-armv8.S    |  4243 ------
 secure/lib/libcrypto/arch/aarch64/ghashv8-armx.S   |   553 -
 .../lib/libcrypto/arch/aarch64/keccak1600-armv8.S  |  1010 --
 secure/lib/libcrypto/arch/aarch64/poly1305-armv8.S |   864 --
 secure/lib/libcrypto/arch/aarch64/sha1-armv8.S     |  1212 --
 secure/lib/libcrypto/arch/aarch64/sha256-armv8.S   |  2052 ---
 secure/lib/libcrypto/arch/aarch64/sha512-armv8.S   |  1607 ---
 secure/lib/libcrypto/arch/aarch64/vpaes-armv8.S    |  1197 --
 secure/lib/libcrypto/arch/amd64/aes-x86_64.S       |  2680 ----
 secure/lib/libcrypto/arch/amd64/aesni-gcm-x86_64.S |   811 --
 secure/lib/libcrypto/arch/amd64/aesni-mb-x86_64.S  |  1610 ---
 .../lib/libcrypto/arch/amd64/aesni-sha1-x86_64.S   |  3057 -----
 .../lib/libcrypto/arch/amd64/aesni-sha256-x86_64.S |  4457 ------
 secure/lib/libcrypto/arch/amd64/aesni-x86_64.S     |  4507 ------
 secure/lib/libcrypto/arch/amd64/bsaes-x86_64.S     |  2619 ----
 secure/lib/libcrypto/arch/amd64/chacha-x86_64.S    |  2215 ---
 secure/lib/libcrypto/arch/amd64/cmll-x86_64.S      |  1947 ---
 secure/lib/libcrypto/arch/amd64/e_padlock-x86_64.S |  1059 --
 .../lib/libcrypto/arch/amd64/ecp_nistz256-x86_64.S |  7365 ----------
 secure/lib/libcrypto/arch/amd64/ghash-x86_64.S     |  1875 ---
 .../lib/libcrypto/arch/amd64/keccak1600-x86_64.S   |   546 -
 secure/lib/libcrypto/arch/amd64/md5-x86_64.S       |   705 -
 secure/lib/libcrypto/arch/amd64/poly1305-x86_64.S  |  2090 ---
 secure/lib/libcrypto/arch/amd64/rc4-md5-x86_64.S   |  1303 --
 secure/lib/libcrypto/arch/amd64/rc4-x86_64.S       |   657 -
 secure/lib/libcrypto/arch/amd64/rsaz-avx2.S        |  1766 ---
 secure/lib/libcrypto/arch/amd64/rsaz-avx512.S      |   902 --
 secure/lib/libcrypto/arch/amd64/rsaz-x86_64.S      |  2037 ---
 secure/lib/libcrypto/arch/amd64/sha1-mb-x86_64.S   |  7325 ----------
 secure/lib/libcrypto/arch/amd64/sha1-x86_64.S      |  5472 --------
 secure/lib/libcrypto/arch/amd64/sha256-mb-x86_64.S |  8006 -----------
 secure/lib/libcrypto/arch/amd64/sha256-x86_64.S    |  5478 --------
 secure/lib/libcrypto/arch/amd64/sha512-x86_64.S    |  5483 --------
 secure/lib/libcrypto/arch/amd64/vpaes-x86_64.S     |   880 --
 secure/lib/libcrypto/arch/amd64/wp-x86_64.S        |   901 --
 secure/lib/libcrypto/arch/amd64/x25519-x86_64.S    |   824 --
 secure/lib/libcrypto/arch/amd64/x86_64-gf2m.S      |   333 -
 secure/lib/libcrypto/arch/amd64/x86_64-mont.S      |  1261 --
 secure/lib/libcrypto/arch/amd64/x86_64-mont5.S     |  3625 -----
 secure/lib/libcrypto/arch/amd64/x86_64cpuid.S      |   513 -
 secure/lib/libcrypto/arch/arm/aes-armv4.S          |  1198 --
 secure/lib/libcrypto/arch/arm/aesv8-armx.S         |  1088 --
 secure/lib/libcrypto/arch/arm/armv4-gf2m.S         |   236 -
 secure/lib/libcrypto/arch/arm/armv4-mont.S         |   961 --
 secure/lib/libcrypto/arch/arm/armv4cpuid.S         |   273 -
 secure/lib/libcrypto/arch/arm/bsaes-armv7.S        |  2561 ----
 secure/lib/libcrypto/arch/arm/chacha-armv4.S       |  1478 --
 secure/lib/libcrypto/arch/arm/ecp_nistz256-armv4.S |  4430 ------
 secure/lib/libcrypto/arch/arm/ghash-armv4.S        |   565 -
 secure/lib/libcrypto/arch/arm/ghashv8-armx.S       |   244 -
 secure/lib/libcrypto/arch/arm/keccak1600-armv4.S   |  2694 ----
 secure/lib/libcrypto/arch/arm/poly1305-armv4.S     |  1169 --
 secure/lib/libcrypto/arch/arm/sha1-armv4-large.S   |  1499 --
 secure/lib/libcrypto/arch/arm/sha256-armv4.S       |  2823 ----
 secure/lib/libcrypto/arch/arm/sha512-armv4.S       |  1877 ---
 secure/lib/libcrypto/arch/i386/aes-586.S           |  6644 ---------
 secure/lib/libcrypto/arch/i386/aesni-x86.S         |  6732 ---------
 secure/lib/libcrypto/arch/i386/bf-586.S            |  1928 ---
 secure/lib/libcrypto/arch/i386/bn-586.S            |  3157 -----
 secure/lib/libcrypto/arch/i386/cast-586.S          |  2002 ---
 secure/lib/libcrypto/arch/i386/chacha-x86.S        |  2084 ---
 secure/lib/libcrypto/arch/i386/cmll-x86.S          |  4896 -------
 secure/lib/libcrypto/arch/i386/co-586.S            |  2584 ----
 secure/lib/libcrypto/arch/i386/crypt586.S          |  1800 ---
 secure/lib/libcrypto/arch/i386/des-586.S           |  3932 ------
 secure/lib/libcrypto/arch/i386/e_padlock-x86.S     |  2300 ----
 secure/lib/libcrypto/arch/i386/ecp_nistz256-x86.S  | 10584 --------------
 secure/lib/libcrypto/arch/i386/ghash-x86.S         |  2636 ----
 secure/lib/libcrypto/arch/i386/md5-586.S           |  1404 --
 secure/lib/libcrypto/arch/i386/poly1305-x86.S      |  3938 ------
 secure/lib/libcrypto/arch/i386/rc4-586.S           |   819 --
 secure/lib/libcrypto/arch/i386/rc5-586.S           |  1264 --
 secure/lib/libcrypto/arch/i386/rmd-586.S           |  3976 ------
 secure/lib/libcrypto/arch/i386/sha1-586.S          |  8016 -----------
 secure/lib/libcrypto/arch/i386/sha256-586.S        | 13612 -------------------
 secure/lib/libcrypto/arch/i386/sha512-586.S        |  5704 --------
 secure/lib/libcrypto/arch/i386/vpaes-x86.S         |  1488 --
 secure/lib/libcrypto/arch/i386/wp-mmx.S            |  2260 ---
 secure/lib/libcrypto/arch/i386/x86-gf2m.S          |   755 -
 secure/lib/libcrypto/arch/i386/x86-mont.S          |   995 --
 secure/lib/libcrypto/arch/i386/x86cpuid.S          |  1217 --
 secure/lib/libcrypto/arch/powerpc/aes-ppc.S        |  1561 ---
 secure/lib/libcrypto/arch/powerpc/aesp8-ppc.S      |  3642 -----
 secure/lib/libcrypto/arch/powerpc/bn-ppc.S         |  1855 ---
 secure/lib/libcrypto/arch/powerpc/chacha-ppc.S     |  1492 --
 secure/lib/libcrypto/arch/powerpc/ghashp8-ppc.S    |   569 -
 secure/lib/libcrypto/arch/powerpc/poly1305-ppc.S   |  1301 --
 secure/lib/libcrypto/arch/powerpc/poly1305-ppcfp.S |   586 -
 secure/lib/libcrypto/arch/powerpc/ppc-mont.S       |  1787 ---
 secure/lib/libcrypto/arch/powerpc/ppc.S            |  1855 ---
 secure/lib/libcrypto/arch/powerpc/ppccpuid.S       |   356 -
 secure/lib/libcrypto/arch/powerpc/sha1-ppc.S       |  1118 --
 secure/lib/libcrypto/arch/powerpc/sha256-ppc.S     |  1321 --
 secure/lib/libcrypto/arch/powerpc/sha256p8-ppc.S   |   735 -
 secure/lib/libcrypto/arch/powerpc/sha512-ppc.S     |  3071 -----
 secure/lib/libcrypto/arch/powerpc/sha512p8-ppc.S   |   833 --
 secure/lib/libcrypto/arch/powerpc/vpaes-ppc.S      |  1468 --
 secure/lib/libcrypto/arch/powerpc64/aes-ppc.S      |  1533 ---
 secure/lib/libcrypto/arch/powerpc64/aesp8-ppc.S    |  3659 -----
 secure/lib/libcrypto/arch/powerpc64/bn-ppc.S       |  1876 ---
 secure/lib/libcrypto/arch/powerpc64/chacha-ppc.S   |  1499 --
 .../libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S  |   354 -
 .../libcrypto/arch/powerpc64/ecp_nistz256-ppc64.S  |  4854 -------
 secure/lib/libcrypto/arch/powerpc64/ghashp8-ppc.S  |   576 -
 .../libcrypto/arch/powerpc64/keccak1600-ppc64.S    |   670 -
 secure/lib/libcrypto/arch/powerpc64/poly1305-ppc.S |  1142 --
 .../lib/libcrypto/arch/powerpc64/poly1305-ppcfp.S  |   596 -
 secure/lib/libcrypto/arch/powerpc64/ppc-mont.S     |  1790 ---
 secure/lib/libcrypto/arch/powerpc64/ppc.S          |  1876 ---
 secure/lib/libcrypto/arch/powerpc64/ppccpuid.S     |   387 -
 secure/lib/libcrypto/arch/powerpc64/sha1-ppc.S     |  1121 --
 secure/lib/libcrypto/arch/powerpc64/sha256-ppc.S   |  1324 --
 secure/lib/libcrypto/arch/powerpc64/sha256p8-ppc.S |   738 -
 secure/lib/libcrypto/arch/powerpc64/sha512-ppc.S   |  1420 --
 secure/lib/libcrypto/arch/powerpc64/sha512p8-ppc.S |   836 --
 secure/lib/libcrypto/arch/powerpc64/vpaes-ppc.S    |  1479 --
 secure/lib/libcrypto/arch/powerpc64/x25519-ppc64.S |   349 -
 secure/lib/libcrypto/arch/powerpc64le/aes-ppc.S    |  1581 ---
 secure/lib/libcrypto/arch/powerpc64le/aesp8-ppc.S  |  3659 -----
 secure/lib/libcrypto/arch/powerpc64le/bn-ppc.S     |  1876 ---
 secure/lib/libcrypto/arch/powerpc64le/chacha-ppc.S |  1371 --
 .../arch/powerpc64le/ecp_nistp521-ppc64.S          |   354 -
 .../arch/powerpc64le/ecp_nistz256-ppc64.S          |  4854 -------
 .../lib/libcrypto/arch/powerpc64le/ghashp8-ppc.S   |   576 -
 .../libcrypto/arch/powerpc64le/keccak1600-ppc64.S  |   670 -
 .../lib/libcrypto/arch/powerpc64le/poly1305-ppc.S  |  1128 --
 .../libcrypto/arch/powerpc64le/poly1305-ppcfp.S    |   591 -
 secure/lib/libcrypto/arch/powerpc64le/ppc-mont.S   |  1790 ---
 secure/lib/libcrypto/arch/powerpc64le/ppc.S        |  1876 ---
 secure/lib/libcrypto/arch/powerpc64le/ppccpuid.S   |   387 -
 secure/lib/libcrypto/arch/powerpc64le/sha1-ppc.S   |  1169 --
 secure/lib/libcrypto/arch/powerpc64le/sha256-ppc.S |  1372 --
 .../lib/libcrypto/arch/powerpc64le/sha256p8-ppc.S  |   746 -
 secure/lib/libcrypto/arch/powerpc64le/sha512-ppc.S |  1516 ---
 .../lib/libcrypto/arch/powerpc64le/sha512p8-ppc.S  |   848 --
 secure/lib/libcrypto/arch/powerpc64le/vpaes-ppc.S  |  1479 --
 .../lib/libcrypto/arch/powerpc64le/x25519-ppc64.S  |   349 -
 secure/lib/libcrypto/engines/padlock/Makefile      |     2 +-
 secure/lib/libcrypto/modules/fips/Makefile         |     4 +-
 145 files changed, 5 insertions(+), 310557 deletions(-)

diff --git a/secure/lib/libcrypto/Makefile b/secure/lib/libcrypto/Makefile
index ab9044ad67f9..585e89861815 100644
--- a/secure/lib/libcrypto/Makefile
+++ b/secure/lib/libcrypto/Makefile
@@ -618,12 +618,12 @@ buildasm cleanasm:
 PICFLAG+=	-DOPENSSL_PIC
 
 .if defined(ASM_${MACHINE_CPUARCH})
-.PATH:	${SRCTOP}/secure/lib/libcrypto/arch/${MACHINE_CPUARCH}
+.PATH:	${SRCTOP}/sys/crypto/openssl/${MACHINE_CPUARCH}
 .if defined(ASM_amd64)
 .PATH:	${LCRYPTO_SRC}/crypto/bn/asm
 .endif
 .elif defined(ASM_${MACHINE_ARCH})
-.PATH:	${SRCTOP}/secure/lib/libcrypto/arch/${MACHINE_ARCH}
+.PATH:	${SRCTOP}/sys/crypto/openssl/${MACHINE_ARCH}
 .endif
 
 .PATH:	${LCRYPTO_SRC}/crypto \
diff --git a/secure/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S b/secure/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S
deleted file mode 100644
index eb85dbc9f996..000000000000
--- a/secure/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S
+++ /dev/null
@@ -1,6390 +0,0 @@
-/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */
-#include "arm_arch.h"
-
-#if __ARM_MAX_ARCH__>=8
-.arch	armv8-a+crypto
-.text
-.globl	aes_gcm_enc_128_kernel
-.type	aes_gcm_enc_128_kernel,%function
-.align	4
-aes_gcm_enc_128_kernel:
-	cbz	x1, .L128_enc_ret
-	stp	x19, x20, [sp, #-112]!
-	mov	x16, x4
-	mov	x8, x5
-	stp	x21, x22, [sp, #16]
-	stp	x23, x24, [sp, #32]
-	stp	d8, d9, [sp, #48]
-	stp	d10, d11, [sp, #64]
-	stp	d12, d13, [sp, #80]
-	stp	d14, d15, [sp, #96]
-
-	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
-#ifdef __AARCH64EB__
-	rev	x10, x10
-	rev	x11, x11
-#endif
-	ldp	x13, x14, [x8, #160]                     //load rk10
-#ifdef __AARCH64EB__
-	ror	x13, x13, #32
-	ror	x14, x14, #32
-#endif
-	ld1	{v11.16b}, [x3]
-	ext	v11.16b, v11.16b, v11.16b, #8
-	rev64	v11.16b, v11.16b
-	lsr	x5, x1, #3              //byte_len
-	mov	x15, x5
-
-	ld1	{v18.4s}, [x8], #16								  //load rk0
-	add	x4, x0, x1, lsr #3   //end_input_ptr
-	sub	x5, x5, #1      //byte_len - 1
-
-	lsr	x12, x11, #32
-	ldr	q15, [x3, #112]                        //load h4l | h4h
-#ifndef __AARCH64EB__
-	ext	v15.16b, v15.16b, v15.16b, #8
-#endif
-	fmov	d1, x10                               //CTR block 1
-	rev	w12, w12                                //rev_ctr32
-
-	add	w12, w12, #1                            //increment rev_ctr32
-	orr	w11, w11, w11
-	ld1	{v19.4s}, [x8], #16								  //load rk1
-
-	rev	w9, w12                                 //CTR block 1
-	add	w12, w12, #1                            //CTR block 1
-	fmov	d3, x10                               //CTR block 3
-
-	orr	x9, x11, x9, lsl #32            //CTR block 1
-	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
-
-	fmov	v1.d[1], x9                               //CTR block 1
-	rev	w9, w12                                 //CTR block 2
-
-	fmov	d2, x10                               //CTR block 2
-	orr	x9, x11, x9, lsl #32            //CTR block 2
-	add	w12, w12, #1                            //CTR block 2
-
-	fmov	v2.d[1], x9                               //CTR block 2
-	rev	w9, w12                                 //CTR block 3
-
-	orr	x9, x11, x9, lsl #32            //CTR block 3
-	ld1	{v20.4s}, [x8], #16								  //load rk2
-
-	add	w12, w12, #1                            //CTR block 3
-	fmov	v3.d[1], x9                               //CTR block 3
-
-	ldr	q14, [x3, #80]                         //load h3l | h3h
-#ifndef __AARCH64EB__
-	ext	v14.16b, v14.16b, v14.16b, #8
-#endif
-	aese	v1.16b, v18.16b
-	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
-	ld1	{v21.4s}, [x8], #16								  //load rk3
-
-	aese	v2.16b, v18.16b
-	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
-	ldr	q12, [x3, #32]                         //load h1l | h1h
-#ifndef __AARCH64EB__
-	ext	v12.16b, v12.16b, v12.16b, #8
-#endif
-
-	aese	v0.16b, v18.16b
-	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
-	ld1	{v22.4s}, [x8], #16								  //load rk4
-
-	aese	v3.16b, v18.16b
-	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
-	ld1	{v23.4s}, [x8], #16								  //load rk5
-
-	aese	v2.16b, v19.16b
-	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
-	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
-
-	aese	v0.16b, v19.16b
-	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
-	ld1	{v24.4s}, [x8], #16								  //load rk6
-
-	aese	v1.16b, v19.16b
-	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
-	ld1	{v25.4s}, [x8], #16								  //load rk7
-
-	aese	v3.16b, v19.16b
-	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
-	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
-
-	aese	v0.16b, v20.16b
-	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
-	ld1	{v26.4s}, [x8], #16								  //load rk8
-
-	aese	v1.16b, v20.16b
-	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
-	ldr	q13, [x3, #64]                         //load h2l | h2h
-#ifndef __AARCH64EB__
-	ext	v13.16b, v13.16b, v13.16b, #8
-#endif
-
-	aese	v3.16b, v20.16b
-	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
-
-	aese	v2.16b, v20.16b
-	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
-	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
-
-	aese	v0.16b, v21.16b
-	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
-
-	aese	v1.16b, v21.16b
-	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
-
-	aese	v2.16b, v21.16b
-	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
-	ld1	{v27.4s}, [x8], #16								  //load rk9
-
-	aese	v3.16b, v21.16b
-	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
-
-	and	x5, x5, #0xffffffffffffffc0    //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
-	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
-
-	aese	v3.16b, v22.16b
-	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
-	add	x5, x5, x0
-
-	aese	v2.16b, v22.16b
-	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
-	cmp	x0, x5                   //check if we have <= 4 blocks
-
-	aese	v0.16b, v22.16b
-	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
-
-	aese	v3.16b, v23.16b
-	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
-
-	aese	v2.16b, v23.16b
-	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
-
-	aese	v0.16b, v23.16b
-	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
-
-	aese	v3.16b, v24.16b
-	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
-
-	aese	v1.16b, v22.16b
-	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
-
-	aese	v2.16b, v24.16b
-	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
-	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
-
-	aese	v0.16b, v24.16b
-	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
-
-	aese	v1.16b, v23.16b
-	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
-
-	aese	v3.16b, v25.16b
-	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
-
-	aese	v0.16b, v25.16b
-	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
-
-	aese	v1.16b, v24.16b
-	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
-
-	aese	v2.16b, v25.16b
-	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
-
-	aese	v0.16b, v26.16b
-	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
-
-	aese	v1.16b, v25.16b
-	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
-
-	aese	v2.16b, v26.16b
-	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
-
-	aese	v3.16b, v26.16b
-	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
-
-	aese	v1.16b, v26.16b
-	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
-
-	aese	v2.16b, v27.16b                                      //AES block 2 - round 9
-
-	aese	v0.16b, v27.16b                                      //AES block 0 - round 9
-
-	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
-
-	aese	v1.16b, v27.16b                                      //AES block 1 - round 9
-
-	aese	v3.16b, v27.16b                                      //AES block 3 - round 9
-	b.ge	.L128_enc_tail                                    //handle tail
-
-	ldp	x6, x7, [x0, #0]            //AES block 0 - load plaintext
-#ifdef __AARCH64EB__
-	rev	x6, x6
-	rev	x7, x7
-#endif
-	ldp	x21, x22, [x0, #32]           //AES block 2 - load plaintext
-#ifdef __AARCH64EB__
-	rev	x21, x21
-	rev	x22, x22
-#endif
-	ldp	x19, x20, [x0, #16]           //AES block 1 - load plaintext
-#ifdef __AARCH64EB__
-	rev	x19, x19
-	rev	x20, x20
-#endif
-	ldp	x23, x24, [x0, #48]           //AES block 3 - load plaintext
-#ifdef __AARCH64EB__
-	rev	x23, x23
-	rev	x24, x24
-#endif
-	eor	x6, x6, x13                     //AES block 0 - round 10 low
-	eor	x7, x7, x14                     //AES block 0 - round 10 high
-
-	eor	x21, x21, x13                     //AES block 2 - round 10 low
-	fmov	d4, x6                               //AES block 0 - mov low
-
-	eor	x19, x19, x13                     //AES block 1 - round 10 low
-	eor	x22, x22, x14                     //AES block 2 - round 10 high
-	fmov	v4.d[1], x7                           //AES block 0 - mov high
-
-	fmov	d5, x19                               //AES block 1 - mov low
-	eor	x20, x20, x14                     //AES block 1 - round 10 high
-
-	eor	x23, x23, x13                     //AES block 3 - round 10 low
-	fmov	v5.d[1], x20                           //AES block 1 - mov high
-
-	fmov	d6, x21                               //AES block 2 - mov low
-	eor	x24, x24, x14                     //AES block 3 - round 10 high
-	rev	w9, w12                                 //CTR block 4
-
-	fmov	v6.d[1], x22                           //AES block 2 - mov high
-	orr	x9, x11, x9, lsl #32            //CTR block 4
-
-	eor	v4.16b, v4.16b, v0.16b                          //AES block 0 - result
-	fmov	d0, x10                               //CTR block 4
-	add	w12, w12, #1                            //CTR block 4
-
-	fmov	v0.d[1], x9                               //CTR block 4
-	rev	w9, w12                                 //CTR block 5
-
-	eor	v5.16b, v5.16b, v1.16b                          //AES block 1 - result
-	fmov	d1, x10                               //CTR block 5
-	orr	x9, x11, x9, lsl #32            //CTR block 5
-
-	add	w12, w12, #1                            //CTR block 5
-	add	x0, x0, #64                       //AES input_ptr update
-	fmov	v1.d[1], x9                               //CTR block 5
-
-	fmov	d7, x23                               //AES block 3 - mov low
-	rev	w9, w12                                 //CTR block 6
-	st1	{ v4.16b}, [x2], #16                     //AES block 0 - store result
-
-	fmov	v7.d[1], x24                           //AES block 3 - mov high
-	orr	x9, x11, x9, lsl #32            //CTR block 6
-
-	add	w12, w12, #1                            //CTR block 6
-	eor	v6.16b, v6.16b, v2.16b                          //AES block 2 - result
-	st1	{ v5.16b}, [x2], #16                     //AES block 1 - store result
-
-	fmov	d2, x10                               //CTR block 6
-	cmp	x0, x5                   //check if we have <= 8 blocks
-
-	fmov	v2.d[1], x9                               //CTR block 6
-	rev	w9, w12                                 //CTR block 7
-	st1	{ v6.16b}, [x2], #16                     //AES block 2 - store result
-
-	orr	x9, x11, x9, lsl #32            //CTR block 7
-
-	eor	v7.16b, v7.16b, v3.16b                          //AES block 3 - result
-	st1	{ v7.16b}, [x2], #16                     //AES block 3 - store result
-	b.ge	.L128_enc_prepretail                              //do prepretail
-
-.L128_enc_main_loop:	//main	loop start
-	ldp	x23, x24, [x0, #48]           //AES block 4k+3 - load plaintext
-#ifdef __AARCH64EB__
-	rev	x23, x23
-	rev	x24, x24
-#endif
-	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
-	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
-
-	aese	v2.16b, v18.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
-	fmov	d3, x10                               //CTR block 4k+3
-
-	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
-	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
-
-	aese	v1.16b, v18.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
-	add	w12, w12, #1                            //CTR block 4k+3
-	fmov	v3.d[1], x9                               //CTR block 4k+3
-
-	aese	v0.16b, v18.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
-	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
-
-	aese	v2.16b, v19.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
-	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
-
-	aese	v1.16b, v19.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
-	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
-
-	aese	v3.16b, v18.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
-	eor	x24, x24, x14                     //AES block 4k+3 - round 10 high
-
-	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
-	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
-	ldp	x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
-#ifdef __AARCH64EB__
-	rev	x6, x6
-	rev	x7, x7
-#endif
-	aese	v0.16b, v19.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
-	rev	w9, w12                                 //CTR block 4k+8
-
-	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
-	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
-	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
-
-	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
-	add	w12, w12, #1                            //CTR block 4k+8
-	mov	d10, v17.d[1]                               //GHASH block 4k - mid
-
-	aese	v0.16b, v20.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
-
-	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
-	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
-
-	aese	v1.16b, v20.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
-
-	aese	v0.16b, v21.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
-	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
-
-	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
-
-	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
-	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
-
-	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
-
-	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
-	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
-
-	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
-	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
-
-	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
-	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
-
-	aese	v3.16b, v19.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
-	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
-
-	aese	v2.16b, v20.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
-	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
-
-	aese	v1.16b, v21.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
-	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
-
-	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
-
-	aese	v2.16b, v21.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
-	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
-
-	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
-
-	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
-	movi	v8.8b, #0xc2
-
-	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
-	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
-
-	aese	v1.16b, v22.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
-
-	aese	v3.16b, v20.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
-	shl	d8, d8, #56               //mod_constant
-
-	aese	v0.16b, v22.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
-	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
-
-	aese	v1.16b, v23.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
-	ldp	x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
-#ifdef __AARCH64EB__
-	rev	x19, x19
-	rev	x20, x20
-#endif
-	aese	v3.16b, v21.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
-	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
-
-	aese	v0.16b, v23.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
-	ldp	x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
-#ifdef __AARCH64EB__
-	rev	x21, x21
-	rev	x22, x22
-#endif
-	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
-	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
-
-	aese	v2.16b, v22.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
-	eor	x19, x19, x13                     //AES block 4k+5 - round 10 low
-
-	aese	v3.16b, v22.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
-	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
-
-	aese	v1.16b, v24.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
-	eor	x23, x23, x13                     //AES block 4k+3 - round 10 low
-
-	aese	v2.16b, v23.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
-	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
-
-	fmov	d4, x6                               //AES block 4k+4 - mov low
-	aese	v0.16b, v24.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
-	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
-
-	add	x0, x0, #64                       //AES input_ptr update
-	fmov	d7, x23                               //AES block 4k+3 - mov low
-	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
-
-	aese	v3.16b, v23.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
-	fmov	d5, x19                               //AES block 4k+5 - mov low
-
-	aese	v0.16b, v25.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
-	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
-
-	aese	v2.16b, v24.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
-	eor	x20, x20, x14                     //AES block 4k+5 - round 10 high
-
-	aese	v1.16b, v25.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
-	fmov	v5.d[1], x20                           //AES block 4k+5 - mov high
-
-	aese	v0.16b, v26.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
-	fmov	v7.d[1], x24                           //AES block 4k+3 - mov high
-
-	aese	v3.16b, v24.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
-	cmp	x0, x5                   //.LOOP CONTROL
-
-	aese	v1.16b, v26.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
-	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
-
-	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
-	eor	x21, x21, x13                     //AES block 4k+6 - round 10 low
-	eor	x22, x22, x14                     //AES block 4k+6 - round 10 high
-
-	aese	v3.16b, v25.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
-	fmov	d6, x21                               //AES block 4k+6 - mov low
-
-	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
-	fmov	v6.d[1], x22                           //AES block 4k+6 - mov high
-
-	aese	v2.16b, v25.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
-	eor	v4.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
-
-	fmov	d0, x10                               //CTR block 4k+8
-	aese	v3.16b, v26.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
-
-	fmov	v0.d[1], x9                               //CTR block 4k+8
-	rev	w9, w12                                 //CTR block 4k+9
-	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
-
-	aese	v2.16b, v26.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
-	eor	v5.16b, v5.16b, v1.16b                          //AES block 4k+5 - result
-
-	add	w12, w12, #1                            //CTR block 4k+9
-	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
-	fmov	d1, x10                               //CTR block 4k+9
-
-	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
-	fmov	v1.d[1], x9                               //CTR block 4k+9
-	rev	w9, w12                                 //CTR block 4k+10
-
-	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
-	st1	{ v4.16b}, [x2], #16                     //AES block 4k+4 - store result
-	eor	v6.16b, v6.16b, v2.16b                          //AES block 4k+6 - result
-	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
-
-	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
-	add	w12, w12, #1                            //CTR block 4k+10
-	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
-	fmov	d2, x10                               //CTR block 4k+10
-
-	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
-	st1	{ v5.16b}, [x2], #16                     //AES block 4k+5 - store result
-
-	fmov	v2.d[1], x9                               //CTR block 4k+10
-	st1	{ v6.16b}, [x2], #16                     //AES block 4k+6 - store result
-	rev	w9, w12                                 //CTR block 4k+11
-
-	orr	x9, x11, x9, lsl #32            //CTR block 4k+11
-	eor	v7.16b, v7.16b, v3.16b                          //AES block 4k+3 - result
-
-	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
-	st1	{ v7.16b}, [x2], #16                     //AES block 4k+3 - store result
-	b.lt	.L128_enc_main_loop
-
-.L128_enc_prepretail:	//PREPRETAIL
-	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
-	fmov	d3, x10                               //CTR block 4k+3
-	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
-
-	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
-	add	w12, w12, #1                            //CTR block 4k+3
-	fmov	v3.d[1], x9                               //CTR block 4k+3
-
-	aese	v1.16b, v18.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
-	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
-
-	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
-
-	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
-	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
-
-	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
-
-	aese	v3.16b, v18.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
-	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
-
-	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
-	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
-
-	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
-	mov	d10, v17.d[1]                               //GHASH block 4k - mid
-
-	aese	v1.16b, v19.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
-	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
-
-	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
-
-	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
-	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
-
-	aese	v3.16b, v19.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
-
-	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
-	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
-
-	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
-
-	aese	v0.16b, v18.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
-	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
-
-	aese	v2.16b, v18.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
-
-	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
-	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
-
-	aese	v0.16b, v19.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
-	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
-
-	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
-
-	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
-	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
-
-	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
-
-	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
-
-	aese	v2.16b, v19.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
-	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
-
-	aese	v0.16b, v20.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
-
-	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
-	movi	v8.8b, #0xc2
-
-	aese	v2.16b, v20.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
-	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
-
-	aese	v3.16b, v20.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
-
-	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
-	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
-
-	aese	v2.16b, v21.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
-
-	aese	v1.16b, v20.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
-	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
-
-	aese	v0.16b, v21.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
-
-	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
-	shl	d8, d8, #56               //mod_constant
-
-	aese	v1.16b, v21.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
-	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
-
-	aese	v0.16b, v22.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
-
-	pmull	v28.1q, v9.1d, v8.1d
-	eor	v10.16b, v10.16b, v9.16b                         //karatsuba tidy up
-
-	aese	v1.16b, v22.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
-
-	aese	v0.16b, v23.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
-	ext	v9.16b, v9.16b, v9.16b, #8
-
-	aese	v3.16b, v21.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
-
-	aese	v2.16b, v22.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
-	eor	v10.16b, v10.16b, v11.16b
-
-	aese	v0.16b, v24.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
-
-	aese	v3.16b, v22.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
-
-	aese	v1.16b, v23.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
-
-	aese	v2.16b, v23.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
-	eor	v10.16b, v10.16b, v28.16b
-
-	aese	v3.16b, v23.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
-
-	aese	v1.16b, v24.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
-
-	aese	v2.16b, v24.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
-
-	aese	v3.16b, v24.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
-	eor	v10.16b, v10.16b, v9.16b
-
-	aese	v0.16b, v25.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
-
-	aese	v2.16b, v25.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
-
-	aese	v3.16b, v25.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
-
-	pmull	v28.1q, v10.1d, v8.1d
-
-	aese	v1.16b, v25.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
-	ext	v10.16b, v10.16b, v10.16b, #8
-
-	aese	v3.16b, v26.16b
-	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
-
-	aese	v0.16b, v26.16b
-	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
-	eor	v11.16b, v11.16b, v28.16b
-
-	aese	v1.16b, v26.16b
-	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
-
-	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
-
-	aese	v2.16b, v26.16b
-	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
-
-	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
-
-	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
-	eor	v11.16b, v11.16b, v10.16b
-
-	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
-.L128_enc_tail:	//TAIL
-
-	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
-	ldp	x6, x7, [x0], #16           //AES block 4k+4 - load plaintext
-#ifdef __AARCH64EB__
-	rev	x6, x6
-	rev	x7, x7
-#endif
-	cmp	x5, #48
-
-	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
-	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
-	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
-
-	fmov	d4, x6                               //AES block 4k+4 - mov low
-
-	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
-
-	eor	v5.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
-
-	b.gt	.L128_enc_blocks_more_than_3
-
-	sub	w12, w12, #1
-	movi	v11.8b, #0
-	mov	v3.16b, v2.16b
-
-	cmp	x5, #32
-	mov	v2.16b, v1.16b
-	movi	v9.8b, #0
-
-	movi	v10.8b, #0
-	b.gt	.L128_enc_blocks_more_than_2
-
-	mov	v3.16b, v1.16b
-	cmp	x5, #16
-
-	sub	w12, w12, #1
-	b.gt	.L128_enc_blocks_more_than_1
-
-	sub	w12, w12, #1
-	b	.L128_enc_blocks_less_than_1
-.L128_enc_blocks_more_than_3:	//blocks	left >  3
-	st1	{ v5.16b}, [x2], #16                     //AES final-3 block  - store result
-
-	ldp	x6, x7, [x0], #16           //AES final-2 block - load input low & high
-#ifdef __AARCH64EB__
-	rev	x6, x6
-	rev	x7, x7
*** 310632 LINES SKIPPED ***