git: 4f3a6a07112b - releng/14.0 - ossl: Update the generated assembly files from OpenSSL 3.0.

From: Ed Maste <emaste_at_FreeBSD.org>
Date: Wed, 25 Oct 2023 20:06:28 UTC
The branch releng/14.0 has been updated by emaste:

URL: https://cgit.FreeBSD.org/src/commit/?id=4f3a6a07112b4f4f3a04ee9a7de01598c0b1d30f

commit 4f3a6a07112b4f4f3a04ee9a7de01598c0b1d30f
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2023-08-29 21:44:15 +0000
Commit:     Ed Maste <emaste@FreeBSD.org>
CommitDate: 2023-10-25 19:56:23 +0000

    ossl: Update the generated assembly files from OpenSSL 3.0.
    
    Tested with:    cryptocheck -d ossl0 -a all -z on amd64
    Reviewed by:    markj
    Differential Revision:  https://reviews.freebsd.org/D41568
    
    (cherry picked from commit c0855eaa3ee9614804b6bd6a255aa9f71e095f43)
    (cherry picked from commit f0d83d53c3be75ffc7711ba8171af9b934459810)
    
    Approved by:    re (gjb)
---
 sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S      | 6390 +++++++++++++++++++
 sys/crypto/openssl/aarch64/aesv8-armx.S            | 3014 ++++++++-
 sys/crypto/openssl/aarch64/arm64cpuid.S            |    7 +
 sys/crypto/openssl/aarch64/armv8-mont.S            |  732 ++-
 sys/crypto/openssl/aarch64/chacha-armv8.S          | 1553 ++---
 sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S    |    8 +-
 sys/crypto/openssl/aarch64/ghashv8-armx.S          |    1 +
 sys/crypto/openssl/aarch64/keccak1600-armv8.S      |  190 +-
 sys/crypto/openssl/aarch64/poly1305-armv8.S        |   31 +-
 sys/crypto/openssl/aarch64/sha1-armv8.S            |   54 +-
 sys/crypto/openssl/aarch64/sha256-armv8.S          |   28 +-
 sys/crypto/openssl/aarch64/sha512-armv8.S          |   28 +-
 sys/crypto/openssl/aarch64/vpaes-armv8.S           |  276 +-
 sys/crypto/openssl/amd64/aes-x86_64.S              | 2680 ++++++++
 sys/crypto/openssl/amd64/aesni-gcm-x86_64.S        |   21 +
 sys/crypto/openssl/amd64/aesni-mb-x86_64.S         |  102 +
 sys/crypto/openssl/amd64/aesni-sha1-x86_64.S       |   21 +
 sys/crypto/openssl/amd64/aesni-sha256-x86_64.S     |   21 +
 sys/crypto/openssl/amd64/aesni-x86_64.S            |   32 +
 sys/crypto/openssl/amd64/bsaes-x86_64.S            | 2619 ++++++++
 sys/crypto/openssl/amd64/chacha-x86_64.S           |   21 +
 sys/crypto/openssl/amd64/cmll-x86_64.S             |   22 +
 sys/crypto/openssl/amd64/e_padlock-x86_64.S        |   21 +
 sys/crypto/openssl/amd64/ecp_nistz256-x86_64.S     |   21 +
 sys/crypto/openssl/amd64/ghash-x86_64.S            |   27 +
 sys/crypto/openssl/amd64/keccak1600-x86_64.S       |   21 +
 sys/crypto/openssl/amd64/md5-x86_64.S              |   29 +-
 sys/crypto/openssl/amd64/poly1305-x86_64.S         |   21 +
 sys/crypto/openssl/amd64/rc4-md5-x86_64.S          |   21 +
 sys/crypto/openssl/amd64/rc4-x86_64.S              |   24 +
 sys/crypto/openssl/amd64/rsaz-avx2.S               |   21 +
 sys/crypto/openssl/amd64/rsaz-avx512.S             |  902 +++
 sys/crypto/openssl/amd64/rsaz-x86_64.S             |   21 +
 sys/crypto/openssl/amd64/sha1-mb-x86_64.S          |   57 +
 sys/crypto/openssl/amd64/sha1-x86_64.S             |   21 +
 sys/crypto/openssl/amd64/sha256-mb-x86_64.S        |   57 +
 sys/crypto/openssl/amd64/sha256-x86_64.S           |   21 +
 sys/crypto/openssl/amd64/sha512-x86_64.S           |   21 +
 sys/crypto/openssl/amd64/vpaes-x86_64.S            |   26 +
 sys/crypto/openssl/amd64/wp-x86_64.S               |   21 +
 sys/crypto/openssl/amd64/x25519-x86_64.S           |   21 +
 sys/crypto/openssl/amd64/x86_64-gf2m.S             |   21 +
 sys/crypto/openssl/amd64/x86_64-mont.S             |   21 +
 sys/crypto/openssl/amd64/x86_64-mont5.S            |   21 +
 sys/crypto/openssl/amd64/x86_64cpuid.S             |   49 +
 sys/crypto/openssl/arm/aes-armv4.S                 |    7 +-
 sys/crypto/openssl/arm/aesv8-armx.S                |  776 ++-
 sys/crypto/openssl/arm/armv4-gf2m.S                |   13 +-
 sys/crypto/openssl/arm/armv4-mont.S                |   17 +-
 sys/crypto/openssl/arm/armv4cpuid.S                |    3 +-
 sys/crypto/openssl/arm/bsaes-armv7.S               |   47 +-
 sys/crypto/openssl/arm/chacha-armv4.S              |   11 +-
 sys/crypto/openssl/arm/ecp_nistz256-armv4.S        |    4 +-
 sys/crypto/openssl/arm/ghash-armv4.S               |    3 +-
 sys/crypto/openssl/arm/ghashv8-armx.S              |   64 +-
 sys/crypto/openssl/arm/keccak1600-armv4.S          |   34 +-
 sys/crypto/openssl/arm/poly1305-armv4.S            |   37 +-
 sys/crypto/openssl/arm/sha1-armv4-large.S          |   15 +-
 sys/crypto/openssl/arm/sha256-armv4.S              |   17 +-
 sys/crypto/openssl/arm/sha512-armv4.S              |   15 +-
 sys/crypto/openssl/i386/aes-586.S                  | 6644 ++++++++++++++++++++
 sys/crypto/openssl/i386/aesni-x86.S                |  254 +
 sys/crypto/openssl/i386/bf-586.S                   |  134 +
 sys/crypto/openssl/i386/bn-586.S                   |  104 +
 sys/crypto/openssl/i386/cast-586.S                 |  134 +
 sys/crypto/openssl/i386/chacha-x86.S               |   64 +
 sys/crypto/openssl/i386/cmll-x86.S                 |  144 +
 sys/crypto/openssl/i386/co-586.S                   |   74 +
 sys/crypto/openssl/i386/crypt586.S                 |   44 +
 sys/crypto/openssl/i386/des-586.S                  |  254 +
 sys/crypto/openssl/i386/e_padlock-x86.S            |  214 +
 sys/crypto/openssl/i386/ecp_nistz256-x86.S         |  254 +
 sys/crypto/openssl/i386/ghash-x86.S                |  104 +
 sys/crypto/openssl/i386/md5-586.S                  |   64 +-
 sys/crypto/openssl/i386/poly1305-x86.S             |  114 +
 sys/crypto/openssl/i386/rc4-586.S                  |   64 +
 sys/crypto/openssl/i386/rc5-586.S                  |  134 +
 sys/crypto/openssl/i386/rmd-586.S                  |   44 +
 sys/crypto/openssl/i386/sha1-586.S                 |   74 +
 sys/crypto/openssl/i386/sha256-586.S               |   44 +
 sys/crypto/openssl/i386/sha512-586.S               |   44 +
 sys/crypto/openssl/i386/vpaes-x86.S                |  164 +
 sys/crypto/openssl/i386/wp-mmx.S                   |   44 +
 sys/crypto/openssl/i386/x86-gf2m.S                 |   64 +
 sys/crypto/openssl/i386/x86-mont.S                 |   44 +
 sys/crypto/openssl/i386/x86cpuid.S                 |  154 +
 sys/crypto/openssl/powerpc/bn-ppc.S                | 1855 ++++++
 sys/crypto/openssl/powerpc/poly1305-ppc.S          | 1091 +++-
 sys/crypto/openssl/powerpc/vpaes-ppc.S             |   14 +-
 sys/crypto/openssl/powerpc64/bn-ppc.S              | 1876 ++++++
 sys/crypto/openssl/powerpc64/ecp_nistp521-ppc64.S  |  354 ++
 sys/crypto/openssl/powerpc64/keccak1600-ppc64.S    |   32 +-
 sys/crypto/openssl/powerpc64/poly1305-ppc.S        | 1011 ++-
 sys/crypto/openssl/powerpc64/vpaes-ppc.S           |   14 +-
 sys/crypto/openssl/powerpc64le/bn-ppc.S            | 1876 ++++++
 .../openssl/powerpc64le/ecp_nistp521-ppc64.S       |  354 ++
 sys/crypto/openssl/powerpc64le/keccak1600-ppc64.S  |   32 +-
 sys/crypto/openssl/powerpc64le/poly1305-ppc.S      | 1002 ++-
 sys/crypto/openssl/powerpc64le/vpaes-ppc.S         |   14 +-
 99 files changed, 37489 insertions(+), 1910 deletions(-)

diff --git a/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S
new file mode 100644
index 000000000000..eb85dbc9f996
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S
@@ -0,0 +1,6390 @@
+/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */
+#include "arm_arch.h"
+
+#if __ARM_MAX_ARCH__>=8
+.arch	armv8-a+crypto
+.text
+.globl	aes_gcm_enc_128_kernel
+.type	aes_gcm_enc_128_kernel,%function
+.align	4
+aes_gcm_enc_128_kernel:
+	cbz	x1, .L128_enc_ret
+	stp	x19, x20, [sp, #-112]!
+	mov	x16, x4
+	mov	x8, x5
+	stp	x21, x22, [sp, #16]
+	stp	x23, x24, [sp, #32]
+	stp	d8, d9, [sp, #48]
+	stp	d10, d11, [sp, #64]
+	stp	d12, d13, [sp, #80]
+	stp	d14, d15, [sp, #96]
+
+	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
+#ifdef __AARCH64EB__
+	rev	x10, x10
+	rev	x11, x11
+#endif
+	ldp	x13, x14, [x8, #160]                     //load rk10
+#ifdef __AARCH64EB__
+	ror	x13, x13, #32
+	ror	x14, x14, #32
+#endif
+	ld1	{v11.16b}, [x3]
+	ext	v11.16b, v11.16b, v11.16b, #8
+	rev64	v11.16b, v11.16b
+	lsr	x5, x1, #3              //byte_len
+	mov	x15, x5
+
+	ld1	{v18.4s}, [x8], #16								  //load rk0
+	add	x4, x0, x1, lsr #3   //end_input_ptr
+	sub	x5, x5, #1      //byte_len - 1
+
+	lsr	x12, x11, #32
+	ldr	q15, [x3, #112]                        //load h4l | h4h
+#ifndef __AARCH64EB__
+	ext	v15.16b, v15.16b, v15.16b, #8
+#endif
+	fmov	d1, x10                               //CTR block 1
+	rev	w12, w12                                //rev_ctr32
+
+	add	w12, w12, #1                            //increment rev_ctr32
+	orr	w11, w11, w11
+	ld1	{v19.4s}, [x8], #16								  //load rk1
+
+	rev	w9, w12                                 //CTR block 1
+	add	w12, w12, #1                            //CTR block 1
+	fmov	d3, x10                               //CTR block 3
+
+	orr	x9, x11, x9, lsl #32            //CTR block 1
+	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
+
+	fmov	v1.d[1], x9                               //CTR block 1
+	rev	w9, w12                                 //CTR block 2
+
+	fmov	d2, x10                               //CTR block 2
+	orr	x9, x11, x9, lsl #32            //CTR block 2
+	add	w12, w12, #1                            //CTR block 2
+
+	fmov	v2.d[1], x9                               //CTR block 2
+	rev	w9, w12                                 //CTR block 3
+
+	orr	x9, x11, x9, lsl #32            //CTR block 3
+	ld1	{v20.4s}, [x8], #16								  //load rk2
+
+	add	w12, w12, #1                            //CTR block 3
+	fmov	v3.d[1], x9                               //CTR block 3
+
+	ldr	q14, [x3, #80]                         //load h3l | h3h
+#ifndef __AARCH64EB__
+	ext	v14.16b, v14.16b, v14.16b, #8
+#endif
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
+	ld1	{v21.4s}, [x8], #16								  //load rk3
+
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
+	ldr	q12, [x3, #32]                         //load h1l | h1h
+#ifndef __AARCH64EB__
+	ext	v12.16b, v12.16b, v12.16b, #8
+#endif
+
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
+	ld1	{v22.4s}, [x8], #16								  //load rk4
+
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
+	ld1	{v23.4s}, [x8], #16								  //load rk5
+
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
+	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
+
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
+	ld1	{v24.4s}, [x8], #16								  //load rk6
+
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
+	ld1	{v25.4s}, [x8], #16								  //load rk7
+
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
+	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
+
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
+	ld1	{v26.4s}, [x8], #16								  //load rk8
+
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
+	ldr	q13, [x3, #64]                         //load h2l | h2h
+#ifndef __AARCH64EB__
+	ext	v13.16b, v13.16b, v13.16b, #8
+#endif
+
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
+
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
+	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
+
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
+
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
+
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
+	ld1	{v27.4s}, [x8], #16								  //load rk9
+
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
+
+	and	x5, x5, #0xffffffffffffffc0    //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
+
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
+	add	x5, x5, x0
+
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
+	cmp	x0, x5                   //check if we have <= 4 blocks
+
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
+
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
+
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
+
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
+
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
+
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
+
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
+	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
+
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
+
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
+
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
+
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
+
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
+
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
+
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
+
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
+
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
+
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
+
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
+
+	aese	v2.16b, v27.16b                                      //AES block 2 - round 9
+
+	aese	v0.16b, v27.16b                                      //AES block 0 - round 9
+
+	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
+
+	aese	v1.16b, v27.16b                                      //AES block 1 - round 9
+
+	aese	v3.16b, v27.16b                                      //AES block 3 - round 9
+	b.ge	.L128_enc_tail                                    //handle tail
+
+	ldp	x6, x7, [x0, #0]            //AES block 0 - load plaintext
+#ifdef __AARCH64EB__
+	rev	x6, x6
+	rev	x7, x7
+#endif
+	ldp	x21, x22, [x0, #32]           //AES block 2 - load plaintext
+#ifdef __AARCH64EB__
+	rev	x21, x21
+	rev	x22, x22
+#endif
+	ldp	x19, x20, [x0, #16]           //AES block 1 - load plaintext
+#ifdef __AARCH64EB__
+	rev	x19, x19
+	rev	x20, x20
+#endif
+	ldp	x23, x24, [x0, #48]           //AES block 3 - load plaintext
+#ifdef __AARCH64EB__
+	rev	x23, x23
+	rev	x24, x24
+#endif
+	eor	x6, x6, x13                     //AES block 0 - round 10 low
+	eor	x7, x7, x14                     //AES block 0 - round 10 high
+
+	eor	x21, x21, x13                     //AES block 2 - round 10 low
+	fmov	d4, x6                               //AES block 0 - mov low
+
+	eor	x19, x19, x13                     //AES block 1 - round 10 low
+	eor	x22, x22, x14                     //AES block 2 - round 10 high
+	fmov	v4.d[1], x7                           //AES block 0 - mov high
+
+	fmov	d5, x19                               //AES block 1 - mov low
+	eor	x20, x20, x14                     //AES block 1 - round 10 high
+
+	eor	x23, x23, x13                     //AES block 3 - round 10 low
+	fmov	v5.d[1], x20                           //AES block 1 - mov high
+
+	fmov	d6, x21                               //AES block 2 - mov low
+	eor	x24, x24, x14                     //AES block 3 - round 10 high
+	rev	w9, w12                                 //CTR block 4
+
+	fmov	v6.d[1], x22                           //AES block 2 - mov high
+	orr	x9, x11, x9, lsl #32            //CTR block 4
+
+	eor	v4.16b, v4.16b, v0.16b                          //AES block 0 - result
+	fmov	d0, x10                               //CTR block 4
+	add	w12, w12, #1                            //CTR block 4
+
+	fmov	v0.d[1], x9                               //CTR block 4
+	rev	w9, w12                                 //CTR block 5
+
+	eor	v5.16b, v5.16b, v1.16b                          //AES block 1 - result
+	fmov	d1, x10                               //CTR block 5
+	orr	x9, x11, x9, lsl #32            //CTR block 5
+
+	add	w12, w12, #1                            //CTR block 5
+	add	x0, x0, #64                       //AES input_ptr update
+	fmov	v1.d[1], x9                               //CTR block 5
+
+	fmov	d7, x23                               //AES block 3 - mov low
+	rev	w9, w12                                 //CTR block 6
+	st1	{ v4.16b}, [x2], #16                     //AES block 0 - store result
+
+	fmov	v7.d[1], x24                           //AES block 3 - mov high
+	orr	x9, x11, x9, lsl #32            //CTR block 6
+
+	add	w12, w12, #1                            //CTR block 6
+	eor	v6.16b, v6.16b, v2.16b                          //AES block 2 - result
+	st1	{ v5.16b}, [x2], #16                     //AES block 1 - store result
+
+	fmov	d2, x10                               //CTR block 6
+	cmp	x0, x5                   //check if we have <= 8 blocks
+
+	fmov	v2.d[1], x9                               //CTR block 6
+	rev	w9, w12                                 //CTR block 7
+	st1	{ v6.16b}, [x2], #16                     //AES block 2 - store result
+
+	orr	x9, x11, x9, lsl #32            //CTR block 7
+
+	eor	v7.16b, v7.16b, v3.16b                          //AES block 3 - result
+	st1	{ v7.16b}, [x2], #16                     //AES block 3 - store result
+	b.ge	.L128_enc_prepretail                              //do prepretail
+
+.L128_enc_main_loop:	//main	loop start
+	ldp	x23, x24, [x0, #48]           //AES block 4k+3 - load plaintext
+#ifdef __AARCH64EB__
+	rev	x23, x23
+	rev	x24, x24
+#endif
+	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
+	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
+
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
+	fmov	d3, x10                               //CTR block 4k+3
+
+	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
+	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
+
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
+	add	w12, w12, #1                            //CTR block 4k+3
+	fmov	v3.d[1], x9                               //CTR block 4k+3
+
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
+	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
+
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
+	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
+
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
+	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
+
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
+	eor	x24, x24, x14                     //AES block 4k+3 - round 10 high
+
+	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
+	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
+	ldp	x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
+#ifdef __AARCH64EB__
+	rev	x6, x6
+	rev	x7, x7
+#endif
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
+	rev	w9, w12                                 //CTR block 4k+8
+
+	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
+	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
+
+	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
+	add	w12, w12, #1                            //CTR block 4k+8
+	mov	d10, v17.d[1]                               //GHASH block 4k - mid
+
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
+
+	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
+	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
+
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
+
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
+	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
+
+	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
+
+	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
+	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
+
+	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
+
+	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
+	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
+
+	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
+	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
+
+	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
+	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
+
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
+	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
+
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
+	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
+
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
+	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
+
+	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
+
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
+	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
+
+	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
+
+	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
+	movi	v8.8b, #0xc2
+
+	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
+	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
+
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
+
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
+	shl	d8, d8, #56               //mod_constant
+
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
+	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
+
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
+	ldp	x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
+#ifdef __AARCH64EB__
+	rev	x19, x19
+	rev	x20, x20
+#endif
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
+	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
+
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
+	ldp	x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
+#ifdef __AARCH64EB__
+	rev	x21, x21
+	rev	x22, x22
+#endif
+	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
+	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
+
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
+	eor	x19, x19, x13                     //AES block 4k+5 - round 10 low
+
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
+	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
+
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
+	eor	x23, x23, x13                     //AES block 4k+3 - round 10 low
+
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
+	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
+
+	fmov	d4, x6                               //AES block 4k+4 - mov low
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
+	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
+
+	add	x0, x0, #64                       //AES input_ptr update
+	fmov	d7, x23                               //AES block 4k+3 - mov low
+	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
+
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
+	fmov	d5, x19                               //AES block 4k+5 - mov low
+
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
+	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
+
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
+	eor	x20, x20, x14                     //AES block 4k+5 - round 10 high
+
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
+	fmov	v5.d[1], x20                           //AES block 4k+5 - mov high
+
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
+	fmov	v7.d[1], x24                           //AES block 4k+3 - mov high
+
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
+	cmp	x0, x5                   //.LOOP CONTROL
+
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
+	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
+
+	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
+	eor	x21, x21, x13                     //AES block 4k+6 - round 10 low
+	eor	x22, x22, x14                     //AES block 4k+6 - round 10 high
+
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
+	fmov	d6, x21                               //AES block 4k+6 - mov low
+
+	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
+	fmov	v6.d[1], x22                           //AES block 4k+6 - mov high
+
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
+	eor	v4.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
+
+	fmov	d0, x10                               //CTR block 4k+8
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
+
+	fmov	v0.d[1], x9                               //CTR block 4k+8
+	rev	w9, w12                                 //CTR block 4k+9
+	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
+
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
+	eor	v5.16b, v5.16b, v1.16b                          //AES block 4k+5 - result
+
+	add	w12, w12, #1                            //CTR block 4k+9
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
+	fmov	d1, x10                               //CTR block 4k+9
+
+	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
+	fmov	v1.d[1], x9                               //CTR block 4k+9
+	rev	w9, w12                                 //CTR block 4k+10
+
+	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
+	st1	{ v4.16b}, [x2], #16                     //AES block 4k+4 - store result
+	eor	v6.16b, v6.16b, v2.16b                          //AES block 4k+6 - result
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
+
+	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
+	add	w12, w12, #1                            //CTR block 4k+10
+	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
+	fmov	d2, x10                               //CTR block 4k+10
+
+	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
+	st1	{ v5.16b}, [x2], #16                     //AES block 4k+5 - store result
+
+	fmov	v2.d[1], x9                               //CTR block 4k+10
+	st1	{ v6.16b}, [x2], #16                     //AES block 4k+6 - store result
+	rev	w9, w12                                 //CTR block 4k+11
+
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+11
+	eor	v7.16b, v7.16b, v3.16b                          //AES block 4k+3 - result
+
+	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
+	st1	{ v7.16b}, [x2], #16                     //AES block 4k+3 - store result
+	b.lt	.L128_enc_main_loop
+
+.L128_enc_prepretail:	//PREPRETAIL
+	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
+	fmov	d3, x10                               //CTR block 4k+3
+	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
+
+	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
+	add	w12, w12, #1                            //CTR block 4k+3
+	fmov	v3.d[1], x9                               //CTR block 4k+3
+
+	aese	v1.16b, v18.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
+	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
+
+	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
+
+	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
+	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
+
+	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
+
+	aese	v3.16b, v18.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
+	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
+
+	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
+	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
+
+	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
+	mov	d10, v17.d[1]                               //GHASH block 4k - mid
+
+	aese	v1.16b, v19.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
+	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
+
+	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
+
+	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
+	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
+
+	aese	v3.16b, v19.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
+
+	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
+	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
+
+	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
+
+	aese	v0.16b, v18.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
+	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
+
+	aese	v2.16b, v18.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
+
+	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
+	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
+
+	aese	v0.16b, v19.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
+	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
+
+	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
+
+	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
+	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
+
+	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
+
+	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
+
+	aese	v2.16b, v19.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
+	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
+
+	aese	v0.16b, v20.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
+
+	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
+	movi	v8.8b, #0xc2
+
+	aese	v2.16b, v20.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
+	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
+
+	aese	v3.16b, v20.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
+
+	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
+	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
+
+	aese	v2.16b, v21.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
+
+	aese	v1.16b, v20.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
+	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
+
+	aese	v0.16b, v21.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
+
+	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
+	shl	d8, d8, #56               //mod_constant
+
+	aese	v1.16b, v21.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
+	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
+
+	aese	v0.16b, v22.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
+
+	pmull	v28.1q, v9.1d, v8.1d
+	eor	v10.16b, v10.16b, v9.16b                         //karatsuba tidy up
+
+	aese	v1.16b, v22.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
+
+	aese	v0.16b, v23.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
+	ext	v9.16b, v9.16b, v9.16b, #8
+
+	aese	v3.16b, v21.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
+
+	aese	v2.16b, v22.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
+	eor	v10.16b, v10.16b, v11.16b
+
+	aese	v0.16b, v24.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
+
+	aese	v3.16b, v22.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
+
+	aese	v1.16b, v23.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
+
+	aese	v2.16b, v23.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
+	eor	v10.16b, v10.16b, v28.16b
+
+	aese	v3.16b, v23.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
+
+	aese	v1.16b, v24.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
+
+	aese	v2.16b, v24.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
+
+	aese	v3.16b, v24.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
+	eor	v10.16b, v10.16b, v9.16b
+
+	aese	v0.16b, v25.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
+
+	aese	v2.16b, v25.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
+
+	aese	v3.16b, v25.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
+
+	pmull	v28.1q, v10.1d, v8.1d
+
+	aese	v1.16b, v25.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
+	ext	v10.16b, v10.16b, v10.16b, #8
+
+	aese	v3.16b, v26.16b
+	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
+
+	aese	v0.16b, v26.16b
+	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
+	eor	v11.16b, v11.16b, v28.16b
+
+	aese	v1.16b, v26.16b
+	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
+
+	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
+
+	aese	v2.16b, v26.16b
+	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
+
+	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
+
+	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
+	eor	v11.16b, v11.16b, v10.16b
+
+	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
+.L128_enc_tail:	//TAIL - finish the first remaining block, then pick an entry point below from the bytes left (x5 vs 48/32/16)
+
+	sub	x5, x4, x0   //x5 = x4 - x0: main_end_input_ptr is number of bytes left to process
+	ldp	x6, x7, [x0], #16           //AES block 4k+4 - load plaintext (x0 post-incremented by 16)
+#ifdef __AARCH64EB__
+	rev	x6, x6                      //big-endian build only: byte-reverse the 64-bit word just loaded
+	rev	x7, x7                      //big-endian build only: byte-reverse the 64-bit word just loaded
+#endif
+	cmp	x5, #48                     //more than 48 bytes left? flags stay live until the b.gt below (nothing in between writes them)
+
+	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag - v8 = v11 with its 64-bit halves swapped
+	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low (NOTE(review): x13 presumably holds the low half of the last round key - confirm)
+	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high (NOTE(review): x14 presumably holds the high half of the last round key - confirm)
+
+	fmov	d4, x6                               //AES block 4k+4 - mov low
+
+	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
+
+	eor	v5.16b, v4.16b, v0.16b                          //AES block 4k+4 - result (v5 is what the sections below store first)
+
+	b.gt	.L128_enc_blocks_more_than_3                    //x5 > 48 (cmp above)
+
+	sub	w12, w12, #1                    //w12 -= 1; also repeated on the <=32 and <=16 paths below (NOTE(review): presumably winds back a counter for each unused block - confirm)
+	movi	v11.8b, #0                      //entering below .L128_enc_blocks_more_than_3: clear v11 (GHASH low; already captured in v8 above)
+	mov	v3.16b, v2.16b                  //move AES blocks up one register: v3 <- v2
+
+	cmp	x5, #32                         //more than 32 bytes left? consumed by the b.gt below (mov/movi do not write flags)
+	mov	v2.16b, v1.16b                  //v2 <- v1; .L128_enc_blocks_more_than_2 XORs its next input block with v2
+	movi	v9.8b, #0                       //clear v9 (GHASH high)
+
+	movi	v10.8b, #0                      //clear v10 (GHASH mid)
+	b.gt	.L128_enc_blocks_more_than_2    //x5 > 32
+
+	mov	v3.16b, v1.16b                  //v3 <- v1 (NOTE(review): presumably consumed at .L128_enc_blocks_more_than_1, whose body is not in view - confirm)
+	cmp	x5, #16                         //more than 16 bytes left? the sub below does not write flags
+
+	sub	w12, w12, #1                    //second w12 decrement (taken on every path with x5 <= 32)
+	b.gt	.L128_enc_blocks_more_than_1    //x5 > 16
+
+	sub	w12, w12, #1                    //third w12 decrement (only when x5 <= 16)
+	b	.L128_enc_blocks_less_than_1    //unconditional: at most one block left
+.L128_enc_blocks_more_than_3:	//blocks	left >  3 - store block final-3, start its GHASH, build block final-2 in v5
+	st1	{ v5.16b}, [x2], #16                     //AES final-3 block  - store result (x2 post-incremented by 16)
+
+	ldp	x6, x7, [x0], #16           //AES final-2 block - load input low & high
+#ifdef __AARCH64EB__
+	rev	x6, x6                      //big-endian build only: byte-reverse the 64-bit word just loaded
+	rev	x7, x7                      //big-endian build only: byte-reverse the 64-bit word just loaded
+#endif
+	rev64	v4.16b, v5.16b                                    //GHASH final-3 block - bytes reversed within each 64-bit half of the block just stored
+
+	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag (v8 as left by the code that branched here)
+	eor	x7, x7, x14                     //AES final-2 block - round 10 high
+	eor	x6, x6, x13                     //AES final-2 block - round 10 low
+
+	fmov	d5, x6                                 //AES final-2 block - mov low
+
+	movi	v8.8b, #0                                        //suppress further partial tag feed in - later "feed in" XORs with v8 become no-ops
+	fmov	v5.d[1], x7                             //AES final-2 block - mov high
+
+	pmull	v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low (overwrites v11; first product of the tail)
+	mov	d22, v4.d[1]                                 //GHASH final-3 block - mid (v22, an aese key operand in the loop above, is scratch here)
+
+	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high (overwrites v9)
+
+	mov	d10, v17.d[1]                               //GHASH final-3 block - mid (upper 64 bits of v17 as the multiplier)
+
+	eor	v5.16b, v5.16b, v1.16b                            //AES final-2 block - result
+	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid (high half XOR low half of v4)
+
+	pmull	v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid (overwrites v10); falls through to the next label
+.L128_enc_blocks_more_than_2:	//blocks	left >  2 - store block final-2, accumulate its GHASH into v9/v10/v11, build block final-1 in v5
+
+	st1	{ v5.16b}, [x2], #16                     //AES final-2 block - store result (x2 post-incremented by 16)
+
+	rev64	v4.16b, v5.16b                                    //GHASH final-2 block - bytes reversed within each 64-bit half of the block just stored
+	ldp	x6, x7, [x0], #16           //AES final-1 block - load input low & high
+#ifdef __AARCH64EB__
+	rev	x6, x6                      //big-endian build only: byte-reverse the 64-bit word just loaded
+	rev	x7, x7                      //big-endian build only: byte-reverse the 64-bit word just loaded
+#endif
+	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag (v8 is zero when arriving by fall-through from the section above)
+
+	eor	x6, x6, x13                     //AES final-1 block - round 10 low
+
+	fmov	d5, x6                                 //AES final-1 block - mov low
+	eor	x7, x7, x14                     //AES final-1 block - round 10 high
+
+	pmull2	v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high (v20, an aese key operand in the loop above, is scratch here)
+	fmov	v5.d[1], x7                             //AES final-1 block - mov high
+
+	mov	d22, v4.d[1]                                 //GHASH final-2 block - mid (v22 is scratch here)
+
+	pmull	v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low (v21 is scratch here)
+
+	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high (accumulate into v9)
+
+	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid (high half XOR low half of v4)
+
+	eor	v5.16b, v5.16b, v2.16b                            //AES final-1 block - result
+
+	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low (accumulate into v11)
+
+	pmull	v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid (multiplier is the low 64 bits of v17)
+
+	movi	v8.8b, #0                                        //suppress further partial tag feed in
+
+	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid (accumulate into v10); falls through to the next label
+.L128_enc_blocks_more_than_1:	//blocks	left >  1
*** 45317 LINES SKIPPED ***