git: 4f3a6a07112b - releng/14.0 - ossl: Update the generated assembly files from OpenSSL 3.0.
Date: Wed, 25 Oct 2023 20:06:28 UTC
The branch releng/14.0 has been updated by emaste:
URL: https://cgit.FreeBSD.org/src/commit/?id=4f3a6a07112b4f4f3a04ee9a7de01598c0b1d30f
commit 4f3a6a07112b4f4f3a04ee9a7de01598c0b1d30f
Author: John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2023-08-29 21:44:15 +0000
Commit: Ed Maste <emaste@FreeBSD.org>
CommitDate: 2023-10-25 19:56:23 +0000
ossl: Update the generated assembly files from OpenSSL 3.0.
Tested with: cryptocheck -d ossl0 -a all -z on amd64
Reviewed by: markj
Differential Revision: https://reviews.freebsd.org/D41568
(cherry picked from commit c0855eaa3ee9614804b6bd6a255aa9f71e095f43)
(cherry picked from commit f0d83d53c3be75ffc7711ba8171af9b934459810)
Approved by: re (gjb)
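
For anyone wanting to reproduce the test cited above, a minimal sketch of the
invocation (assumptions: a source tree at /usr/src, and that the ossl(4) and
cryptodev(4) modules are available; cryptocheck ships in tools/tools/crypto in
the src tree):

    # build the cryptocheck test harness from the source tree
    cd /usr/src/tools/tools/crypto && make cryptocheck
    # load the OpenSSL-derived assembly driver and the /dev/crypto interface
    kldload ossl cryptodev
    # run all supported algorithms against the ossl0 driver across a
    # variety of buffer sizes (-z), as in the Tested-with line above
    ./cryptocheck -d ossl0 -a all -z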
---
sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S | 6390 +++++++++++++++++++
sys/crypto/openssl/aarch64/aesv8-armx.S | 3014 ++++++++-
sys/crypto/openssl/aarch64/arm64cpuid.S | 7 +
sys/crypto/openssl/aarch64/armv8-mont.S | 732 ++-
sys/crypto/openssl/aarch64/chacha-armv8.S | 1553 ++---
sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S | 8 +-
sys/crypto/openssl/aarch64/ghashv8-armx.S | 1 +
sys/crypto/openssl/aarch64/keccak1600-armv8.S | 190 +-
sys/crypto/openssl/aarch64/poly1305-armv8.S | 31 +-
sys/crypto/openssl/aarch64/sha1-armv8.S | 54 +-
sys/crypto/openssl/aarch64/sha256-armv8.S | 28 +-
sys/crypto/openssl/aarch64/sha512-armv8.S | 28 +-
sys/crypto/openssl/aarch64/vpaes-armv8.S | 276 +-
sys/crypto/openssl/amd64/aes-x86_64.S | 2680 ++++++++
sys/crypto/openssl/amd64/aesni-gcm-x86_64.S | 21 +
sys/crypto/openssl/amd64/aesni-mb-x86_64.S | 102 +
sys/crypto/openssl/amd64/aesni-sha1-x86_64.S | 21 +
sys/crypto/openssl/amd64/aesni-sha256-x86_64.S | 21 +
sys/crypto/openssl/amd64/aesni-x86_64.S | 32 +
sys/crypto/openssl/amd64/bsaes-x86_64.S | 2619 ++++++++
sys/crypto/openssl/amd64/chacha-x86_64.S | 21 +
sys/crypto/openssl/amd64/cmll-x86_64.S | 22 +
sys/crypto/openssl/amd64/e_padlock-x86_64.S | 21 +
sys/crypto/openssl/amd64/ecp_nistz256-x86_64.S | 21 +
sys/crypto/openssl/amd64/ghash-x86_64.S | 27 +
sys/crypto/openssl/amd64/keccak1600-x86_64.S | 21 +
sys/crypto/openssl/amd64/md5-x86_64.S | 29 +-
sys/crypto/openssl/amd64/poly1305-x86_64.S | 21 +
sys/crypto/openssl/amd64/rc4-md5-x86_64.S | 21 +
sys/crypto/openssl/amd64/rc4-x86_64.S | 24 +
sys/crypto/openssl/amd64/rsaz-avx2.S | 21 +
sys/crypto/openssl/amd64/rsaz-avx512.S | 902 +++
sys/crypto/openssl/amd64/rsaz-x86_64.S | 21 +
sys/crypto/openssl/amd64/sha1-mb-x86_64.S | 57 +
sys/crypto/openssl/amd64/sha1-x86_64.S | 21 +
sys/crypto/openssl/amd64/sha256-mb-x86_64.S | 57 +
sys/crypto/openssl/amd64/sha256-x86_64.S | 21 +
sys/crypto/openssl/amd64/sha512-x86_64.S | 21 +
sys/crypto/openssl/amd64/vpaes-x86_64.S | 26 +
sys/crypto/openssl/amd64/wp-x86_64.S | 21 +
sys/crypto/openssl/amd64/x25519-x86_64.S | 21 +
sys/crypto/openssl/amd64/x86_64-gf2m.S | 21 +
sys/crypto/openssl/amd64/x86_64-mont.S | 21 +
sys/crypto/openssl/amd64/x86_64-mont5.S | 21 +
sys/crypto/openssl/amd64/x86_64cpuid.S | 49 +
sys/crypto/openssl/arm/aes-armv4.S | 7 +-
sys/crypto/openssl/arm/aesv8-armx.S | 776 ++-
sys/crypto/openssl/arm/armv4-gf2m.S | 13 +-
sys/crypto/openssl/arm/armv4-mont.S | 17 +-
sys/crypto/openssl/arm/armv4cpuid.S | 3 +-
sys/crypto/openssl/arm/bsaes-armv7.S | 47 +-
sys/crypto/openssl/arm/chacha-armv4.S | 11 +-
sys/crypto/openssl/arm/ecp_nistz256-armv4.S | 4 +-
sys/crypto/openssl/arm/ghash-armv4.S | 3 +-
sys/crypto/openssl/arm/ghashv8-armx.S | 64 +-
sys/crypto/openssl/arm/keccak1600-armv4.S | 34 +-
sys/crypto/openssl/arm/poly1305-armv4.S | 37 +-
sys/crypto/openssl/arm/sha1-armv4-large.S | 15 +-
sys/crypto/openssl/arm/sha256-armv4.S | 17 +-
sys/crypto/openssl/arm/sha512-armv4.S | 15 +-
sys/crypto/openssl/i386/aes-586.S | 6644 ++++++++++++++++++++
sys/crypto/openssl/i386/aesni-x86.S | 254 +
sys/crypto/openssl/i386/bf-586.S | 134 +
sys/crypto/openssl/i386/bn-586.S | 104 +
sys/crypto/openssl/i386/cast-586.S | 134 +
sys/crypto/openssl/i386/chacha-x86.S | 64 +
sys/crypto/openssl/i386/cmll-x86.S | 144 +
sys/crypto/openssl/i386/co-586.S | 74 +
sys/crypto/openssl/i386/crypt586.S | 44 +
sys/crypto/openssl/i386/des-586.S | 254 +
sys/crypto/openssl/i386/e_padlock-x86.S | 214 +
sys/crypto/openssl/i386/ecp_nistz256-x86.S | 254 +
sys/crypto/openssl/i386/ghash-x86.S | 104 +
sys/crypto/openssl/i386/md5-586.S | 64 +-
sys/crypto/openssl/i386/poly1305-x86.S | 114 +
sys/crypto/openssl/i386/rc4-586.S | 64 +
sys/crypto/openssl/i386/rc5-586.S | 134 +
sys/crypto/openssl/i386/rmd-586.S | 44 +
sys/crypto/openssl/i386/sha1-586.S | 74 +
sys/crypto/openssl/i386/sha256-586.S | 44 +
sys/crypto/openssl/i386/sha512-586.S | 44 +
sys/crypto/openssl/i386/vpaes-x86.S | 164 +
sys/crypto/openssl/i386/wp-mmx.S | 44 +
sys/crypto/openssl/i386/x86-gf2m.S | 64 +
sys/crypto/openssl/i386/x86-mont.S | 44 +
sys/crypto/openssl/i386/x86cpuid.S | 154 +
sys/crypto/openssl/powerpc/bn-ppc.S | 1855 ++++++
sys/crypto/openssl/powerpc/poly1305-ppc.S | 1091 +++-
sys/crypto/openssl/powerpc/vpaes-ppc.S | 14 +-
sys/crypto/openssl/powerpc64/bn-ppc.S | 1876 ++++++
sys/crypto/openssl/powerpc64/ecp_nistp521-ppc64.S | 354 ++
sys/crypto/openssl/powerpc64/keccak1600-ppc64.S | 32 +-
sys/crypto/openssl/powerpc64/poly1305-ppc.S | 1011 ++-
sys/crypto/openssl/powerpc64/vpaes-ppc.S | 14 +-
sys/crypto/openssl/powerpc64le/bn-ppc.S | 1876 ++++++
.../openssl/powerpc64le/ecp_nistp521-ppc64.S | 354 ++
sys/crypto/openssl/powerpc64le/keccak1600-ppc64.S | 32 +-
sys/crypto/openssl/powerpc64le/poly1305-ppc.S | 1002 ++-
sys/crypto/openssl/powerpc64le/vpaes-ppc.S | 14 +-
99 files changed, 37489 insertions(+), 1910 deletions(-)
diff --git a/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S
new file mode 100644
index 000000000000..eb85dbc9f996
--- /dev/null
+++ b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S
@@ -0,0 +1,6390 @@
+/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */
+#include "arm_arch.h"
+
+#if __ARM_MAX_ARCH__>=8
+.arch armv8-a+crypto
+.text
+.globl aes_gcm_enc_128_kernel
+.type aes_gcm_enc_128_kernel,%function
+.align 4
+aes_gcm_enc_128_kernel:
+ cbz x1, .L128_enc_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+ mov x8, x5
+ stp x21, x22, [sp, #16]
+ stp x23, x24, [sp, #32]
+ stp d8, d9, [sp, #48]
+ stp d10, d11, [sp, #64]
+ stp d12, d13, [sp, #80]
+ stp d14, d15, [sp, #96]
+
+ ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
+#ifdef __AARCH64EB__
+ rev x10, x10
+ rev x11, x11
+#endif
+ ldp x13, x14, [x8, #160] //load rk10
+#ifdef __AARCH64EB__
+ ror x13, x13, #32
+ ror x14, x14, #32
+#endif
+ ld1 {v11.16b}, [x3]
+ ext v11.16b, v11.16b, v11.16b, #8
+ rev64 v11.16b, v11.16b
+ lsr x5, x1, #3 //byte_len
+ mov x15, x5
+
+ ld1 {v18.4s}, [x8], #16 //load rk0
+ add x4, x0, x1, lsr #3 //end_input_ptr
+ sub x5, x5, #1 //byte_len - 1
+
+ lsr x12, x11, #32
+ ldr q15, [x3, #112] //load h4l | h4h
+#ifndef __AARCH64EB__
+ ext v15.16b, v15.16b, v15.16b, #8
+#endif
+ fmov d1, x10 //CTR block 1
+ rev w12, w12 //rev_ctr32
+
+ add w12, w12, #1 //increment rev_ctr32
+ orr w11, w11, w11
+ ld1 {v19.4s}, [x8], #16 //load rk1
+
+ rev w9, w12 //CTR block 1
+ add w12, w12, #1 //CTR block 1
+ fmov d3, x10 //CTR block 3
+
+ orr x9, x11, x9, lsl #32 //CTR block 1
+ ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible
+
+ fmov v1.d[1], x9 //CTR block 1
+ rev w9, w12 //CTR block 2
+
+ fmov d2, x10 //CTR block 2
+ orr x9, x11, x9, lsl #32 //CTR block 2
+ add w12, w12, #1 //CTR block 2
+
+ fmov v2.d[1], x9 //CTR block 2
+ rev w9, w12 //CTR block 3
+
+ orr x9, x11, x9, lsl #32 //CTR block 3
+ ld1 {v20.4s}, [x8], #16 //load rk2
+
+ add w12, w12, #1 //CTR block 3
+ fmov v3.d[1], x9 //CTR block 3
+
+ ldr q14, [x3, #80] //load h3l | h3h
+#ifndef __AARCH64EB__
+ ext v14.16b, v14.16b, v14.16b, #8
+#endif
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 0
+ ld1 {v21.4s}, [x8], #16 //load rk3
+
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 0
+ ldr q12, [x3, #32] //load h1l | h1h
+#ifndef __AARCH64EB__
+ ext v12.16b, v12.16b, v12.16b, #8
+#endif
+
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 0
+ ld1 {v22.4s}, [x8], #16 //load rk4
+
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 0
+ ld1 {v23.4s}, [x8], #16 //load rk5
+
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 1
+ trn2 v17.2d, v14.2d, v15.2d //h4l | h3l
+
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 1
+ ld1 {v24.4s}, [x8], #16 //load rk6
+
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 1
+ ld1 {v25.4s}, [x8], #16 //load rk7
+
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 1
+ trn1 v9.2d, v14.2d, v15.2d //h4h | h3h
+
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 2
+ ld1 {v26.4s}, [x8], #16 //load rk8
+
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 2
+ ldr q13, [x3, #64] //load h2l | h2h
+#ifndef __AARCH64EB__
+ ext v13.16b, v13.16b, v13.16b, #8
+#endif
+
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 2
+
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 2
+ eor v17.16b, v17.16b, v9.16b //h4k | h3k
+
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 3
+
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 3
+
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 3
+ ld1 {v27.4s}, [x8], #16 //load rk9
+
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 3
+
+ and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+ trn2 v16.2d, v12.2d, v13.2d //h2l | h1l
+
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 4
+ add x5, x5, x0
+
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 4
+ cmp x0, x5 //check if we have <= 4 blocks
+
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 4
+
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 5
+
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 5
+
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 5
+
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 6
+
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 4
+
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 6
+ trn1 v8.2d, v12.2d, v13.2d //h2h | h1h
+
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 6
+
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 5
+
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 7
+
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 7
+
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 6
+
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 7
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 0 - round 8
+
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 7
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 2 - round 8
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 3 - round 8
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 1 - round 8
+
+ aese v2.16b, v27.16b //AES block 2 - round 9
+
+ aese v0.16b, v27.16b //AES block 0 - round 9
+
+ eor v16.16b, v16.16b, v8.16b //h2k | h1k
+
+ aese v1.16b, v27.16b //AES block 1 - round 9
+
+ aese v3.16b, v27.16b //AES block 3 - round 9
+ b.ge .L128_enc_tail //handle tail
+
+ ldp x6, x7, [x0, #0] //AES block 0 - load plaintext
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
+ ldp x21, x22, [x0, #32] //AES block 2 - load plaintext
+#ifdef __AARCH64EB__
+ rev x21, x21
+ rev x22, x22
+#endif
+ ldp x19, x20, [x0, #16] //AES block 1 - load plaintext
+#ifdef __AARCH64EB__
+ rev x19, x19
+ rev x20, x20
+#endif
+ ldp x23, x24, [x0, #48] //AES block 3 - load plaintext
+#ifdef __AARCH64EB__
+ rev x23, x23
+ rev x24, x24
+#endif
+ eor x6, x6, x13 //AES block 0 - round 10 low
+ eor x7, x7, x14 //AES block 0 - round 10 high
+
+ eor x21, x21, x13 //AES block 2 - round 10 low
+ fmov d4, x6 //AES block 0 - mov low
+
+ eor x19, x19, x13 //AES block 1 - round 10 low
+ eor x22, x22, x14 //AES block 2 - round 10 high
+ fmov v4.d[1], x7 //AES block 0 - mov high
+
+ fmov d5, x19 //AES block 1 - mov low
+ eor x20, x20, x14 //AES block 1 - round 10 high
+
+ eor x23, x23, x13 //AES block 3 - round 10 low
+ fmov v5.d[1], x20 //AES block 1 - mov high
+
+ fmov d6, x21 //AES block 2 - mov low
+ eor x24, x24, x14 //AES block 3 - round 10 high
+ rev w9, w12 //CTR block 4
+
+ fmov v6.d[1], x22 //AES block 2 - mov high
+ orr x9, x11, x9, lsl #32 //CTR block 4
+
+ eor v4.16b, v4.16b, v0.16b //AES block 0 - result
+ fmov d0, x10 //CTR block 4
+ add w12, w12, #1 //CTR block 4
+
+ fmov v0.d[1], x9 //CTR block 4
+ rev w9, w12 //CTR block 5
+
+ eor v5.16b, v5.16b, v1.16b //AES block 1 - result
+ fmov d1, x10 //CTR block 5
+ orr x9, x11, x9, lsl #32 //CTR block 5
+
+ add w12, w12, #1 //CTR block 5
+ add x0, x0, #64 //AES input_ptr update
+ fmov v1.d[1], x9 //CTR block 5
+
+ fmov d7, x23 //AES block 3 - mov low
+ rev w9, w12 //CTR block 6
+ st1 { v4.16b}, [x2], #16 //AES block 0 - store result
+
+ fmov v7.d[1], x24 //AES block 3 - mov high
+ orr x9, x11, x9, lsl #32 //CTR block 6
+
+ add w12, w12, #1 //CTR block 6
+ eor v6.16b, v6.16b, v2.16b //AES block 2 - result
+ st1 { v5.16b}, [x2], #16 //AES block 1 - store result
+
+ fmov d2, x10 //CTR block 6
+ cmp x0, x5 //check if we have <= 8 blocks
+
+ fmov v2.d[1], x9 //CTR block 6
+ rev w9, w12 //CTR block 7
+ st1 { v6.16b}, [x2], #16 //AES block 2 - store result
+
+ orr x9, x11, x9, lsl #32 //CTR block 7
+
+ eor v7.16b, v7.16b, v3.16b //AES block 3 - result
+ st1 { v7.16b}, [x2], #16 //AES block 3 - store result
+ b.ge .L128_enc_prepretail //do prepretail
+
+.L128_enc_main_loop: //main loop start
+ ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext
+#ifdef __AARCH64EB__
+ rev x23, x23
+ rev x24, x24
+#endif
+ rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)
+ rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)
+
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
+ fmov d3, x10 //CTR block 4k+3
+
+ ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
+ rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)
+
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
+ add w12, w12, #1 //CTR block 4k+3
+ fmov v3.d[1], x9 //CTR block 4k+3
+
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
+ mov d31, v6.d[1] //GHASH block 4k+2 - mid
+
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
+ mov d30, v5.d[1] //GHASH block 4k+1 - mid
+
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
+ eor v4.16b, v4.16b, v11.16b //PRE 1
+
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
+ eor x24, x24, x14 //AES block 4k+3 - round 10 high
+
+ pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
+ eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid
+ ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
+ rev w9, w12 //CTR block 4k+8
+
+ eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid
+ mov d8, v4.d[1] //GHASH block 4k - mid
+ orr x9, x11, x9, lsl #32 //CTR block 4k+8
+
+ pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
+ add w12, w12, #1 //CTR block 4k+8
+ mov d10, v17.d[1] //GHASH block 4k - mid
+
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
+
+ pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
+ eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
+
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
+
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
+ eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high
+
+ pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
+
+ pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
+ rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)
+
+ pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid
+
+ pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
+ ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid
+
+ pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
+ eor x7, x7, x14 //AES block 4k+4 - round 10 high
+
+ eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid
+ mov d30, v7.d[1] //GHASH block 4k+3 - mid
+
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
+ eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low
+
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
+ eor x6, x6, x13 //AES block 4k+4 - round 10 low
+
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
+ eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid
+
+ pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
+
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
+ eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high
+
+ pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
+
+ pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
+ movi v8.8b, #0xc2
+
+ pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
+ eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low
+
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
+
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
+ shl d8, d8, #56 //mod_constant
+
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
+ eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high
+
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
+ ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext
+#ifdef __AARCH64EB__
+ rev x19, x19
+ rev x20, x20
+#endif
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
+ eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid
+
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
+ ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext
+#ifdef __AARCH64EB__
+ rev x21, x21
+ rev x22, x22
+#endif
+ pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
+ eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low
+
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
+ eor x19, x19, x13 //AES block 4k+5 - round 10 low
+
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
+ eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
+
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
+ eor x23, x23, x13 //AES block 4k+3 - round 10 low
+
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
+ eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
+
+ fmov d4, x6 //AES block 4k+4 - mov low
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
+ fmov v4.d[1], x7 //AES block 4k+4 - mov high
+
+ add x0, x0, #64 //AES input_ptr update
+ fmov d7, x23 //AES block 4k+3 - mov low
+ ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
+
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
+ fmov d5, x19 //AES block 4k+5 - mov low
+
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
+ eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
+
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
+ eor x20, x20, x14 //AES block 4k+5 - round 10 high
+
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
+ fmov v5.d[1], x20 //AES block 4k+5 - mov high
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
+ fmov v7.d[1], x24 //AES block 4k+3 - mov high
+
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
+ cmp x0, x5 //.LOOP CONTROL
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
+ eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
+
+ aese v0.16b, v27.16b //AES block 4k+4 - round 9
+ eor x21, x21, x13 //AES block 4k+6 - round 10 low
+ eor x22, x22, x14 //AES block 4k+6 - round 10 high
+
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
+ fmov d6, x21 //AES block 4k+6 - mov low
+
+ aese v1.16b, v27.16b //AES block 4k+5 - round 9
+ fmov v6.d[1], x22 //AES block 4k+6 - mov high
+
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
+ eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result
+
+ fmov d0, x10 //CTR block 4k+8
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
+
+ fmov v0.d[1], x9 //CTR block 4k+8
+ rev w9, w12 //CTR block 4k+9
+ eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
+ eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result
+
+ add w12, w12, #1 //CTR block 4k+9
+ orr x9, x11, x9, lsl #32 //CTR block 4k+9
+ fmov d1, x10 //CTR block 4k+9
+
+ pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
+ fmov v1.d[1], x9 //CTR block 4k+9
+ rev w9, w12 //CTR block 4k+10
+
+ aese v2.16b, v27.16b //AES block 4k+6 - round 9
+ st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result
+ eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result
+ orr x9, x11, x9, lsl #32 //CTR block 4k+10
+
+ aese v3.16b, v27.16b //AES block 4k+7 - round 9
+ add w12, w12, #1 //CTR block 4k+10
+ ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
+ fmov d2, x10 //CTR block 4k+10
+
+ eor v11.16b, v11.16b, v9.16b //MODULO - fold into low
+ st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result
+
+ fmov v2.d[1], x9 //CTR block 4k+10
+ st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result
+ rev w9, w12 //CTR block 4k+11
+
+ orr x9, x11, x9, lsl #32 //CTR block 4k+11
+ eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result
+
+ eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
+ st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result
+ b.lt .L128_enc_main_loop
+
+.L128_enc_prepretail: //PREPRETAIL
+ rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)
+ fmov d3, x10 //CTR block 4k+3
+ rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)
+
+ ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
+ add w12, w12, #1 //CTR block 4k+3
+ fmov v3.d[1], x9 //CTR block 4k+3
+
+ aese v1.16b, v18.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
+ rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)
+
+ pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
+
+ rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)
+ eor v4.16b, v4.16b, v11.16b //PRE 1
+
+ pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
+
+ aese v3.16b, v18.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
+ mov d30, v5.d[1] //GHASH block 4k+1 - mid
+
+ pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
+ mov d8, v4.d[1] //GHASH block 4k - mid
+
+ mov d31, v6.d[1] //GHASH block 4k+2 - mid
+ mov d10, v17.d[1] //GHASH block 4k - mid
+
+ aese v1.16b, v19.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
+ eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid
+
+ eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
+
+ pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
+ eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid
+
+ aese v3.16b, v19.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
+
+ pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid
+ eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low
+
+ pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
+
+ aese v0.16b, v18.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
+ ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid
+
+ aese v2.16b, v18.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
+
+ eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid
+ mov d30, v7.d[1] //GHASH block 4k+3 - mid
+
+ aese v0.16b, v19.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
+ eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high
+
+ pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
+
+ pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
+ eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid
+
+ pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
+
+ pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
+
+ aese v2.16b, v19.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
+ eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high
+
+ aese v0.16b, v20.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
+
+ pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
+ movi v8.8b, #0xc2
+
+ aese v2.16b, v20.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
+ eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low
+
+ aese v3.16b, v20.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
+
+ pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
+ eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid
+
+ aese v2.16b, v21.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
+
+ aese v1.16b, v20.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
+ eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high
+
+ aese v0.16b, v21.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
+
+ eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
+ shl d8, d8, #56 //mod_constant
+
+ aese v1.16b, v21.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
+ eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low
+
+ aese v0.16b, v22.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
+
+ pmull v28.1q, v9.1d, v8.1d
+ eor v10.16b, v10.16b, v9.16b //karatsuba tidy up
+
+ aese v1.16b, v22.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
+
+ aese v0.16b, v23.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
+ ext v9.16b, v9.16b, v9.16b, #8
+
+ aese v3.16b, v21.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
+
+ aese v2.16b, v22.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
+ eor v10.16b, v10.16b, v11.16b
+
+ aese v0.16b, v24.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
+
+ aese v3.16b, v22.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
+
+ aese v1.16b, v23.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
+
+ aese v2.16b, v23.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
+ eor v10.16b, v10.16b, v28.16b
+
+ aese v3.16b, v23.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
+
+ aese v1.16b, v24.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
+
+ aese v2.16b, v24.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
+
+ aese v3.16b, v24.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
+ eor v10.16b, v10.16b, v9.16b
+
+ aese v0.16b, v25.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
+
+ aese v2.16b, v25.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
+
+ aese v3.16b, v25.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
+
+ pmull v28.1q, v10.1d, v8.1d
+
+ aese v1.16b, v25.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
+ ext v10.16b, v10.16b, v10.16b, #8
+
+ aese v3.16b, v26.16b
+ aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
+
+ aese v0.16b, v26.16b
+ aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
+ eor v11.16b, v11.16b, v28.16b
+
+ aese v1.16b, v26.16b
+ aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
+
+ aese v3.16b, v27.16b //AES block 4k+7 - round 9
+
+ aese v2.16b, v26.16b
+ aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
+
+ aese v0.16b, v27.16b //AES block 4k+4 - round 9
+
+ aese v1.16b, v27.16b //AES block 4k+5 - round 9
+ eor v11.16b, v11.16b, v10.16b
+
+ aese v2.16b, v27.16b //AES block 4k+6 - round 9
+.L128_enc_tail: //TAIL
+
+ sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
+ ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
+ cmp x5, #48
+
+ ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
+ eor x6, x6, x13 //AES block 4k+4 - round 10 low
+ eor x7, x7, x14 //AES block 4k+4 - round 10 high
+
+ fmov d4, x6 //AES block 4k+4 - mov low
+
+ fmov v4.d[1], x7 //AES block 4k+4 - mov high
+
+ eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result
+
+ b.gt .L128_enc_blocks_more_than_3
+
+ sub w12, w12, #1
+ movi v11.8b, #0
+ mov v3.16b, v2.16b
+
+ cmp x5, #32
+ mov v2.16b, v1.16b
+ movi v9.8b, #0
+
+ movi v10.8b, #0
+ b.gt .L128_enc_blocks_more_than_2
+
+ mov v3.16b, v1.16b
+ cmp x5, #16
+
+ sub w12, w12, #1
+ b.gt .L128_enc_blocks_more_than_1
+
+ sub w12, w12, #1
+ b .L128_enc_blocks_less_than_1
+.L128_enc_blocks_more_than_3: //blocks left > 3
+ st1 { v5.16b}, [x2], #16 //AES final-3 block - store result
+
+ ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
+ rev64 v4.16b, v5.16b //GHASH final-3 block
+
+ eor v4.16b, v4.16b, v8.16b //feed in partial tag
+ eor x7, x7, x14 //AES final-2 block - round 10 high
+ eor x6, x6, x13 //AES final-2 block - round 10 low
+
+ fmov d5, x6 //AES final-2 block - mov low
+
+ movi v8.8b, #0 //suppress further partial tag feed in
+ fmov v5.d[1], x7 //AES final-2 block - mov high
+
+ pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low
+ mov d22, v4.d[1] //GHASH final-3 block - mid
+
+ pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high
+
+ mov d10, v17.d[1] //GHASH final-3 block - mid
+
+ eor v5.16b, v5.16b, v1.16b //AES final-2 block - result
+ eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid
+
+ pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
+.L128_enc_blocks_more_than_2: //blocks left > 2
+
+ st1 { v5.16b}, [x2], #16 //AES final-2 block - store result
+
+ rev64 v4.16b, v5.16b //GHASH final-2 block
+ ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high
+#ifdef __AARCH64EB__
+ rev x6, x6
+ rev x7, x7
+#endif
+ eor v4.16b, v4.16b, v8.16b //feed in partial tag
+
+ eor x6, x6, x13 //AES final-1 block - round 10 low
+
+ fmov d5, x6 //AES final-1 block - mov low
+ eor x7, x7, x14 //AES final-1 block - round 10 high
+
+ pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high
+ fmov v5.d[1], x7 //AES final-1 block - mov high
+
+ mov d22, v4.d[1] //GHASH final-2 block - mid
+
+ pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low
+
+ eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
+
+ eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid
+
+ eor v5.16b, v5.16b, v2.16b //AES final-1 block - result
+
+ eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low
+
+ pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid
+
+ movi v8.8b, #0 //suppress further partial tag feed in
+
+ eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
+.L128_enc_blocks_more_than_1: //blocks left > 1
*** 45317 LINES SKIPPED ***