svn commit: r338875 - in projects/openssl111/secure/lib/libcrypto: . aarch64
Jung-uk Kim
jkim at FreeBSD.org
Sat Sep 22 02:23:05 UTC 2018
Author: jkim
Date: Sat Sep 22 02:23:03 2018
New Revision: 338875
URL: https://svnweb.freebsd.org/changeset/base/338875
Log:
Regen assembly files for aarch64.
Added:
projects/openssl111/secure/lib/libcrypto/aarch64/armv8-mont.S (contents, props changed)
projects/openssl111/secure/lib/libcrypto/aarch64/chacha-armv8.S (contents, props changed)
projects/openssl111/secure/lib/libcrypto/aarch64/ecp_nistz256-armv8.S (contents, props changed)
projects/openssl111/secure/lib/libcrypto/aarch64/keccak1600-armv8.S (contents, props changed)
projects/openssl111/secure/lib/libcrypto/aarch64/poly1305-armv8.S (contents, props changed)
projects/openssl111/secure/lib/libcrypto/aarch64/vpaes-armv8.S (contents, props changed)
Modified:
projects/openssl111/secure/lib/libcrypto/Makefile.asm
projects/openssl111/secure/lib/libcrypto/aarch64/aesv8-armx.S
projects/openssl111/secure/lib/libcrypto/aarch64/ghashv8-armx.S
projects/openssl111/secure/lib/libcrypto/aarch64/sha1-armv8.S
projects/openssl111/secure/lib/libcrypto/aarch64/sha256-armv8.S
projects/openssl111/secure/lib/libcrypto/aarch64/sha512-armv8.S
Modified: projects/openssl111/secure/lib/libcrypto/Makefile.asm
==============================================================================
--- projects/openssl111/secure/lib/libcrypto/Makefile.asm Sat Sep 22 01:24:30 2018 (r338874)
+++ projects/openssl111/secure/lib/libcrypto/Makefile.asm Sat Sep 22 02:23:03 2018 (r338875)
@@ -10,19 +10,35 @@
.PATH: ${LCRYPTO_SRC}/crypto \
${LCRYPTO_SRC}/crypto/aes/asm \
+ ${LCRYPTO_SRC}/crypto/bn/asm \
+ ${LCRYPTO_SRC}/crypto/chacha/asm \
+ ${LCRYPTO_SRC}/crypto/ec/asm \
${LCRYPTO_SRC}/crypto/modes/asm \
+ ${LCRYPTO_SRC}/crypto/poly1305/asm \
${LCRYPTO_SRC}/crypto/sha/asm
PERLPATH= -I${LCRYPTO_SRC}/crypto/perlasm
# aes
-SRCS= aesv8-armx.pl
+SRCS= aesv8-armx.pl vpaes-armv8.pl
+# bn
+SRCS+= armv8-mont.pl
+
+# chacha
+SRCS+= chacha-armv8.pl
+
+# ec
+SRCS+= ecp_nistz256-armv8.pl
+
# modes
SRCS+= ghashv8-armx.pl
+# poly1305
+SRCS+= poly1305-armv8.pl
+
# sha
-SRCS+= sha1-armv8.pl sha512-armv8.pl
+SRCS+= keccak1600-armv8.pl sha1-armv8.pl sha512-armv8.pl
ASM= ${SRCS:R:S/$/.S/} sha256-armv8.S
@@ -32,13 +48,13 @@ CLEANFILES= ${ASM} ${SRCS:R:S/$/.s/} sha256-armv8.s
.SUFFIXES: .pl
sha256-armv8.S: sha512-armv8.pl
- env CC=cc perl ${.ALLSRC} 64 ${.TARGET:R:S/$/.s/}
+ env CC=cc perl ${.ALLSRC} linux64 ${.TARGET:R:S/$/.s/}
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.ALLSRC:T:R:S/$/.pl/}. */' ;\
cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
.pl.S:
- env CC=cc perl ${.IMPSRC} 64 ${.TARGET:R:S/$/.s/}
+ env CC=cc perl ${.IMPSRC} linux64 ${.TARGET:R:S/$/.s/}
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.IMPSRC:T:R:S/$/.pl/}. */' ;\
cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
@@ -160,10 +176,10 @@ CLEANFILES= ${ASM} ${SRCS:R:S/$/.s/}
aes-armv4.S: aes-armv4.pl
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.ALLSRC:T}. */' ;\
- env CC=cc perl ${.ALLSRC} elf ) > ${.TARGET}
+ env CC=cc perl ${.ALLSRC} linux32 ) > ${.TARGET}
.pl.S:
- env CC=cc perl ${.IMPSRC} elf ${.TARGET:R:S/$/.s/}
+ env CC=cc perl ${.IMPSRC} linux32 ${.TARGET:R:S/$/.s/}
( echo '/* $$'FreeBSD'$$ */' ;\
echo '/* Do not modify. This file is auto-generated from ${.IMPSRC:T:R:S/$/.pl/}. */' ;\
cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
Modified: projects/openssl111/secure/lib/libcrypto/aarch64/aesv8-armx.S
==============================================================================
--- projects/openssl111/secure/lib/libcrypto/aarch64/aesv8-armx.S Sat Sep 22 01:24:30 2018 (r338874)
+++ projects/openssl111/secure/lib/libcrypto/aarch64/aesv8-armx.S Sat Sep 22 02:23:03 2018 (r338875)
@@ -5,7 +5,7 @@
#if __ARM_MAX_ARCH__>=7
.text
.align 5
-rcon:
+.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
@@ -30,7 +30,7 @@ aes_v8_set_encrypt_key:
tst w1,#0x3f
b.ne .Lenc_key_abort
- adr x3,rcon
+ adr x3,.Lrcon
cmp w1,#192
eor v0.16b,v0.16b,v0.16b
@@ -54,7 +54,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
+ eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
@@ -71,7 +71,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
+ eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
@@ -85,7 +85,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
+ eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2]
@@ -116,7 +116,7 @@ aes_v8_set_encrypt_key:
dup v5.4s,v3.s[3]
eor v5.16b,v5.16b,v4.16b
- eor v6.16b,v6.16b,v1.16b
+ eor v6.16b,v6.16b,v1.16b
ext v4.16b,v0.16b,v4.16b,#12
shl v1.16b,v1.16b,#1
eor v4.16b,v4.16b,v5.16b
@@ -147,7 +147,7 @@ aes_v8_set_encrypt_key:
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
- eor v6.16b,v6.16b,v1.16b
+ eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
@@ -291,13 +291,13 @@ aes_v8_cbc_encrypt:
ld1 {v6.16b},[x4]
ld1 {v0.16b},[x0],x8
- ld1 {v16.4s-v17.4s},[x3] // load key schedule...
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#6
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
sub w5,w5,#2
- ld1 {v18.4s-v19.4s},[x7],#32
- ld1 {v20.4s-v21.4s},[x7],#32
- ld1 {v22.4s-v23.4s},[x7],#32
+ ld1 {v18.4s,v19.4s},[x7],#32
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
@@ -309,7 +309,7 @@ aes_v8_cbc_encrypt:
eor v5.16b,v16.16b,v7.16b
b.eq .Lcbc_enc128
- ld1 {v2.4s-v3.4s},[x7]
+ ld1 {v2.4s,v3.4s},[x7]
add x7,x3,#16
add x6,x3,#16*4
add x12,x3,#16*5
@@ -323,7 +323,7 @@ aes_v8_cbc_encrypt:
.Loop_cbc_enc:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
- st1 {v6.16b},[x1],#16
+ st1 {v6.16b},[x1],#16
.Lenter_cbc_enc:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
@@ -347,21 +347,21 @@ aes_v8_cbc_encrypt:
.Lcbc_enc192:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
- subs x2,x2,#16
+ subs x2,x2,#16
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
- csel x8,xzr,x8,eq
+ csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
- ld1 {v16.16b},[x0],x8
+ ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
- eor v16.16b,v16.16b,v5.16b
+ eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
- ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
+ ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
@@ -373,35 +373,35 @@ aes_v8_cbc_encrypt:
.align 5
.Lcbc_enc128:
- ld1 {v2.4s-v3.4s},[x7]
+ ld1 {v2.4s,v3.4s},[x7]
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
b .Lenter_cbc_enc128
.Loop_cbc_enc128:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
- st1 {v6.16b},[x1],#16
+ st1 {v6.16b},[x1],#16
.Lenter_cbc_enc128:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
- subs x2,x2,#16
+ subs x2,x2,#16
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
- csel x8,xzr,x8,eq
+ csel x8,xzr,x8,eq
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
- ld1 {v16.16b},[x0],x8
+ ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
- eor v16.16b,v16.16b,v5.16b
+ eor v16.16b,v16.16b,v5.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs .Loop_cbc_enc128
@@ -448,58 +448,58 @@ aes_v8_cbc_encrypt:
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
- eor v4.16b,v6.16b,v7.16b
- subs x2,x2,#0x30
- eor v5.16b,v2.16b,v7.16b
- csel x6,x2,x6,lo // x6, w6, is zero at this point
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30
+ eor v5.16b,v2.16b,v7.16b
+ csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
- eor v17.16b,v3.16b,v7.16b
- add x0,x0,x6 // x0 is adjusted in such way that
+ eor v17.16b,v3.16b,v7.16b
+ add x0,x0,x6 // x0 is adjusted in such way that
// at exit from the loop v1.16b-v18.16b
// are loaded with last "words"
- orr v6.16b,v19.16b,v19.16b
- mov x7,x3
+ orr v6.16b,v19.16b,v19.16b
+ mov x7,x3
aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
- ld1 {v2.16b},[x0],#16
+ ld1 {v2.16b},[x0],#16
aesd v0.16b,v21.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
- ld1 {v3.16b},[x0],#16
+ ld1 {v3.16b},[x0],#16
aesd v0.16b,v22.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
- ld1 {v19.16b},[x0],#16
+ ld1 {v19.16b},[x0],#16
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
- ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
- add w6,w5,#2
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ add w6,w5,#2
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
eor v18.16b,v18.16b,v17.16b
- ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v4.16b},[x1],#16
- orr v0.16b,v2.16b,v2.16b
+ orr v0.16b,v2.16b,v2.16b
st1 {v5.16b},[x1],#16
- orr v1.16b,v3.16b,v3.16b
+ orr v1.16b,v3.16b,v3.16b
st1 {v18.16b},[x1],#16
- orr v18.16b,v19.16b,v19.16b
+ orr v18.16b,v19.16b,v19.16b
b.hs .Loop3x_cbc_dec
cmn x2,#0x30
@@ -532,30 +532,30 @@ aes_v8_cbc_encrypt:
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
- cmn x2,#0x20
+ cmn x2,#0x20
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
- eor v5.16b,v6.16b,v7.16b
+ eor v5.16b,v6.16b,v7.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
- eor v17.16b,v3.16b,v7.16b
+ eor v17.16b,v3.16b,v7.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
b.eq .Lcbc_dec_one
eor v5.16b,v5.16b,v1.16b
eor v17.16b,v17.16b,v18.16b
- orr v6.16b,v19.16b,v19.16b
+ orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
st1 {v17.16b},[x1],#16
b .Lcbc_done
.Lcbc_dec_one:
eor v5.16b,v5.16b,v18.16b
- orr v6.16b,v19.16b,v19.16b
+ orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
.Lcbc_done:
@@ -568,181 +568,181 @@ aes_v8_cbc_encrypt:
.type aes_v8_ctr32_encrypt_blocks,%function
.align 5
aes_v8_ctr32_encrypt_blocks:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
- ldr w5,[x3,#240]
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ldr w5,[x3,#240]
- ldr w8, [x4, #12]
- ld1 {v0.4s},[x4]
+ ldr w8, [x4, #12]
+ ld1 {v0.4s},[x4]
- ld1 {v16.4s-v17.4s},[x3] // load key schedule...
- sub w5,w5,#4
- mov x12,#16
- cmp x2,#2
- add x7,x3,x5,lsl#4 // pointer to last 5 round keys
- sub w5,w5,#2
- ld1 {v20.4s-v21.4s},[x7],#32
- ld1 {v22.4s-v23.4s},[x7],#32
- ld1 {v7.4s},[x7]
- add x7,x3,#32
- mov w6,w5
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
+ sub w5,w5,#4
+ mov x12,#16
+ cmp x2,#2
+ add x7,x3,x5,lsl#4 // pointer to last 5 round keys
+ sub w5,w5,#2
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+ add x7,x3,#32
+ mov w6,w5
csel x12,xzr,x12,lo
#ifndef __ARMEB__
- rev w8, w8
+ rev w8, w8
#endif
- orr v1.16b,v0.16b,v0.16b
- add w10, w8, #1
- orr v18.16b,v0.16b,v0.16b
- add w8, w8, #2
- orr v6.16b,v0.16b,v0.16b
- rev w10, w10
- mov v1.s[3],w10
- b.ls .Lctr32_tail
- rev w12, w8
- sub x2,x2,#3 // bias
- mov v18.s[3],w12
- b .Loop3x_ctr32
+ orr v1.16b,v0.16b,v0.16b
+ add w10, w8, #1
+ orr v18.16b,v0.16b,v0.16b
+ add w8, w8, #2
+ orr v6.16b,v0.16b,v0.16b
+ rev w10, w10
+ mov v1.s[3],w10
+ b.ls .Lctr32_tail
+ rev w12, w8
+ sub x2,x2,#3 // bias
+ mov v18.s[3],w12
+ b .Loop3x_ctr32
.align 4
.Loop3x_ctr32:
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v1.16b,v1.16b
- aese v18.16b,v16.16b
- aesmc v18.16b,v18.16b
- ld1 {v16.4s},[x7],#16
- subs w6,w6,#2
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v17.16b
- aesmc v1.16b,v1.16b
- aese v18.16b,v17.16b
- aesmc v18.16b,v18.16b
- ld1 {v17.4s},[x7],#16
- b.gt .Loop3x_ctr32
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop3x_ctr32
- aese v0.16b,v16.16b
- aesmc v4.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v5.16b,v1.16b
- ld1 {v2.16b},[x0],#16
- orr v0.16b,v6.16b,v6.16b
- aese v18.16b,v16.16b
- aesmc v18.16b,v18.16b
- ld1 {v3.16b},[x0],#16
- orr v1.16b,v6.16b,v6.16b
- aese v4.16b,v17.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v17.16b
- aesmc v5.16b,v5.16b
- ld1 {v19.16b},[x0],#16
- mov x7,x3
- aese v18.16b,v17.16b
- aesmc v17.16b,v18.16b
- orr v18.16b,v6.16b,v6.16b
- add w9,w8,#1
- aese v4.16b,v20.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v20.16b
- aesmc v5.16b,v5.16b
- eor v2.16b,v2.16b,v7.16b
- add w10,w8,#2
- aese v17.16b,v20.16b
- aesmc v17.16b,v17.16b
- eor v3.16b,v3.16b,v7.16b
- add w8,w8,#3
- aese v4.16b,v21.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v21.16b
- aesmc v5.16b,v5.16b
- eor v19.16b,v19.16b,v7.16b
- rev w9,w9
- aese v17.16b,v21.16b
- aesmc v17.16b,v17.16b
- mov v0.s[3], w9
- rev w10,w10
- aese v4.16b,v22.16b
- aesmc v4.16b,v4.16b
- aese v5.16b,v22.16b
- aesmc v5.16b,v5.16b
- mov v1.s[3], w10
- rev w12,w8
- aese v17.16b,v22.16b
- aesmc v17.16b,v17.16b
- mov v18.s[3], w12
- subs x2,x2,#3
- aese v4.16b,v23.16b
- aese v5.16b,v23.16b
- aese v17.16b,v23.16b
+ aese v0.16b,v16.16b
+ aesmc v4.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v5.16b,v1.16b
+ ld1 {v2.16b},[x0],#16
+ orr v0.16b,v6.16b,v6.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ orr v1.16b,v6.16b,v6.16b
+ aese v4.16b,v17.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v17.16b
+ aesmc v5.16b,v5.16b
+ ld1 {v19.16b},[x0],#16
+ mov x7,x3
+ aese v18.16b,v17.16b
+ aesmc v17.16b,v18.16b
+ orr v18.16b,v6.16b,v6.16b
+ add w9,w8,#1
+ aese v4.16b,v20.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v20.16b
+ aesmc v5.16b,v5.16b
+ eor v2.16b,v2.16b,v7.16b
+ add w10,w8,#2
+ aese v17.16b,v20.16b
+ aesmc v17.16b,v17.16b
+ eor v3.16b,v3.16b,v7.16b
+ add w8,w8,#3
+ aese v4.16b,v21.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v21.16b
+ aesmc v5.16b,v5.16b
+ eor v19.16b,v19.16b,v7.16b
+ rev w9,w9
+ aese v17.16b,v21.16b
+ aesmc v17.16b,v17.16b
+ mov v0.s[3], w9
+ rev w10,w10
+ aese v4.16b,v22.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v22.16b
+ aesmc v5.16b,v5.16b
+ mov v1.s[3], w10
+ rev w12,w8
+ aese v17.16b,v22.16b
+ aesmc v17.16b,v17.16b
+ mov v18.s[3], w12
+ subs x2,x2,#3
+ aese v4.16b,v23.16b
+ aese v5.16b,v23.16b
+ aese v17.16b,v23.16b
- eor v2.16b,v2.16b,v4.16b
- ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
- st1 {v2.16b},[x1],#16
- eor v3.16b,v3.16b,v5.16b
- mov w6,w5
- st1 {v3.16b},[x1],#16
- eor v19.16b,v19.16b,v17.16b
- ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
- st1 {v19.16b},[x1],#16
- b.hs .Loop3x_ctr32
+ eor v2.16b,v2.16b,v4.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ st1 {v2.16b},[x1],#16
+ eor v3.16b,v3.16b,v5.16b
+ mov w6,w5
+ st1 {v3.16b},[x1],#16
+ eor v19.16b,v19.16b,v17.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ st1 {v19.16b},[x1],#16
+ b.hs .Loop3x_ctr32
- adds x2,x2,#3
- b.eq .Lctr32_done
- cmp x2,#1
- mov x12,#16
+ adds x2,x2,#3
+ b.eq .Lctr32_done
+ cmp x2,#1
+ mov x12,#16
csel x12,xzr,x12,eq
.Lctr32_tail:
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v1.16b,v1.16b
- ld1 {v16.4s},[x7],#16
- subs w6,w6,#2
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v17.16b
- aesmc v1.16b,v1.16b
- ld1 {v17.4s},[x7],#16
- b.gt .Lctr32_tail
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Lctr32_tail
- aese v0.16b,v16.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v16.16b
- aesmc v1.16b,v1.16b
- aese v0.16b,v17.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v17.16b
- aesmc v1.16b,v1.16b
- ld1 {v2.16b},[x0],x12
- aese v0.16b,v20.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v20.16b
- aesmc v1.16b,v1.16b
- ld1 {v3.16b},[x0]
- aese v0.16b,v21.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v21.16b
- aesmc v1.16b,v1.16b
- eor v2.16b,v2.16b,v7.16b
- aese v0.16b,v22.16b
- aesmc v0.16b,v0.16b
- aese v1.16b,v22.16b
- aesmc v1.16b,v1.16b
- eor v3.16b,v3.16b,v7.16b
- aese v0.16b,v23.16b
- aese v1.16b,v23.16b
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v2.16b},[x0],x12
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0]
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ eor v2.16b,v2.16b,v7.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ eor v3.16b,v3.16b,v7.16b
+ aese v0.16b,v23.16b
+ aese v1.16b,v23.16b
- cmp x2,#1
- eor v2.16b,v2.16b,v0.16b
- eor v3.16b,v3.16b,v1.16b
- st1 {v2.16b},[x1],#16
- b.eq .Lctr32_done
- st1 {v3.16b},[x1]
+ cmp x2,#1
+ eor v2.16b,v2.16b,v0.16b
+ eor v3.16b,v3.16b,v1.16b
+ st1 {v2.16b},[x1],#16
+ b.eq .Lctr32_done
+ st1 {v3.16b},[x1]
.Lctr32_done:
- ldr x29,[sp],#16
+ ldr x29,[sp],#16
ret
.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
#endif
Added: projects/openssl111/secure/lib/libcrypto/aarch64/armv8-mont.S
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ projects/openssl111/secure/lib/libcrypto/aarch64/armv8-mont.S Sat Sep 22 02:23:03 2018 (r338875)
@@ -0,0 +1,1406 @@
+/* $FreeBSD$ */
+/* Do not modify. This file is auto-generated from armv8-mont.pl. */
+.text
+
+.globl bn_mul_mont
+.type bn_mul_mont,%function
+.align 5
+bn_mul_mont:
+ tst x5,#7
+ b.eq __bn_sqr8x_mont
+ tst x5,#3
+ b.eq __bn_mul4x_mont
+.Lmul_mont:
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ ldr x9,[x2],#8 // bp[0]
+ sub x22,sp,x5,lsl#3
+ ldp x7,x8,[x1],#16 // ap[0..1]
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ and x22,x22,#-16 // ABI says so
+ ldp x13,x14,[x3],#16 // np[0..1]
+
+ mul x6,x7,x9 // ap[0]*bp[0]
+ sub x21,x5,#16 // j=num-2
+ umulh x7,x7,x9
+ mul x10,x8,x9 // ap[1]*bp[0]
+ umulh x11,x8,x9
+
+ mul x15,x6,x4 // "tp[0]"*n0
+ mov sp,x22 // alloca
+
+ // (*) mul x12,x13,x15 // np[0]*m1
+ umulh x13,x13,x15
+ mul x16,x14,x15 // np[1]*m1
+ // (*) adds x12,x12,x6 // discarded
+ // (*) As for removal of first multiplication and addition
+ // instructions. The outcome of first addition is
+ // guaranteed to be zero, which leaves two computationally
+ // significant outcomes: it either carries or not. Then
+ // question is when does it carry? Is there alternative
+ // way to deduce it? If you follow operations, you can
+ // observe that condition for carry is quite simple:
+ // x6 being non-zero. So that carry can be calculated
+ // by adding -1 to x6. That's what next instruction does.
+ subs xzr,x6,#1 // (*)
+ umulh x17,x14,x15
+ adc x13,x13,xzr
+ cbz x21,.L1st_skip
+
+.L1st:
+ ldr x8,[x1],#8
+ adds x6,x10,x7
+ sub x21,x21,#8 // j--
+ adc x7,x11,xzr
+
+ ldr x14,[x3],#8
+ adds x12,x16,x13
+ mul x10,x8,x9 // ap[j]*bp[0]
+ adc x13,x17,xzr
+ umulh x11,x8,x9
+
+ adds x12,x12,x6
+ mul x16,x14,x15 // np[j]*m1
+ adc x13,x13,xzr
+ umulh x17,x14,x15
+ str x12,[x22],#8 // tp[j-1]
+ cbnz x21,.L1st
+
+.L1st_skip:
+ adds x6,x10,x7
+ sub x1,x1,x5 // rewind x1
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ sub x3,x3,x5 // rewind x3
+ adc x13,x17,xzr
+
+ adds x12,x12,x6
+ sub x20,x5,#8 // i=num-1
+ adcs x13,x13,x7
+
+ adc x19,xzr,xzr // upmost overflow bit
+ stp x12,x13,[x22]
+
+.Louter:
+ ldr x9,[x2],#8 // bp[i]
+ ldp x7,x8,[x1],#16
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+
+ mul x6,x7,x9 // ap[0]*bp[i]
+ sub x21,x5,#16 // j=num-2
+ umulh x7,x7,x9
+ ldp x13,x14,[x3],#16
+ mul x10,x8,x9 // ap[1]*bp[i]
+ adds x6,x6,x23
+ umulh x11,x8,x9
+ adc x7,x7,xzr
+
+ mul x15,x6,x4
+ sub x20,x20,#8 // i--
+
+ // (*) mul x12,x13,x15 // np[0]*m1
+ umulh x13,x13,x15
+ mul x16,x14,x15 // np[1]*m1
+ // (*) adds x12,x12,x6
+ subs xzr,x6,#1 // (*)
+ umulh x17,x14,x15
+ cbz x21,.Linner_skip
+
+.Linner:
+ ldr x8,[x1],#8
+ adc x13,x13,xzr
+ ldr x23,[x22],#8 // tp[j]
+ adds x6,x10,x7
+ sub x21,x21,#8 // j--
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ ldr x14,[x3],#8
+ adc x13,x17,xzr
+
+ mul x10,x8,x9 // ap[j]*bp[i]
+ adds x6,x6,x23
+ umulh x11,x8,x9
+ adc x7,x7,xzr
+
+ mul x16,x14,x15 // np[j]*m1
+ adds x12,x12,x6
+ umulh x17,x14,x15
+ str x12,[x22,#-16] // tp[j-1]
+ cbnz x21,.Linner
+
+.Linner_skip:
+ ldr x23,[x22],#8 // tp[j]
+ adc x13,x13,xzr
+ adds x6,x10,x7
+ sub x1,x1,x5 // rewind x1
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ sub x3,x3,x5 // rewind x3
+ adcs x13,x17,x19
+ adc x19,xzr,xzr
+
+ adds x6,x6,x23
+ adc x7,x7,xzr
+
+ adds x12,x12,x6
+ adcs x13,x13,x7
+ adc x19,x19,xzr // upmost overflow bit
+ stp x12,x13,[x22,#-16]
+
+ cbnz x20,.Louter
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+ ldr x14,[x3],#8 // np[0]
+ subs x21,x5,#8 // j=num-1 and clear borrow
+ mov x1,x0
+.Lsub:
+ sbcs x8,x23,x14 // tp[j]-np[j]
+ ldr x23,[x22],#8
+ sub x21,x21,#8 // j--
+ ldr x14,[x3],#8
+ str x8,[x1],#8 // rp[j]=tp[j]-np[j]
+ cbnz x21,.Lsub
+
+ sbcs x8,x23,x14
+ sbcs x19,x19,xzr // did it borrow?
+ str x8,[x1],#8 // rp[num-1]
+
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+ ldr x8,[x0],#8 // rp[0]
+ sub x5,x5,#8 // num--
+ nop
+.Lcond_copy:
+ sub x5,x5,#8 // num--
+ csel x14,x23,x8,lo // did it borrow?
+ ldr x23,[x22],#8
+ ldr x8,[x0],#8
+ str xzr,[x22,#-16] // wipe tp
+ str x14,[x0,#-16]
+ cbnz x5,.Lcond_copy
+
+ csel x14,x23,x8,lo
+ str xzr,[x22,#-8] // wipe tp
+ str x14,[x0,#-8]
+
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldr x29,[sp],#64
+ ret
+.size bn_mul_mont,.-bn_mul_mont
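[Editor's aside] The "(*)" comment block and the "Final step" comment in bn_mul_mont above call out the two points worth noticing in the word-serial Montgomery loop: the first np[0]*m1 multiplication and its addition are dropped because the low 64 bits are guaranteed to be zero (the carry is simply "tp[0] is non-zero", computed with subs xzr,x6,#1), and the result is reduced at the end by subtracting the modulus and conditionally copying the original value back. The following is a minimal C sketch of that structure, not the OpenSSL code: it assumes 64-bit limbs, a compiler that provides unsigned __int128, and that n0 holds -np[0]^-1 mod 2^64 (the value the assembly loads with "ldr x4,[x4] // *n0").

#include <stdint.h>
#include <stddef.h>
#include <string.h>

typedef unsigned __int128 u128;

/* rp = ap * bp * R^-1 mod np, with R = 2^(64*num) and n0 = -np[0]^-1 mod 2^64.
 * Illustration only: one multiply pass and one reduce pass per bp word,
 * matching the shape of bn_mul_mont's .Louter/.Linner loops. */
static void mont_mul(uint64_t *rp, const uint64_t *ap, const uint64_t *bp,
                     const uint64_t *np, uint64_t n0, size_t num)
{
    uint64_t tp[num], over = 0;            /* over = "upmost overflow bit" */
    memset(tp, 0, sizeof(tp));

    for (size_t i = 0; i < num; i++) {
        /* tp += ap[] * bp[i]; remember the word that spills past tp[num-1]. */
        u128 acc = 0;
        for (size_t j = 0; j < num; j++) {
            acc += (u128)ap[j] * bp[i] + tp[j];
            tp[j] = (uint64_t)acc;
            acc >>= 64;
        }
        u128 top = acc + over;

        /* Reduce: m1 = tp[0]*n0, then tp = (tp + np*m1) / 2^64. */
        uint64_t m1 = tp[0] * n0;
        /* (*) np[0]*m1 + tp[0] is 0 mod 2^64 by choice of n0, so only the
         *     carry matters, and it is set exactly when tp[0] != 0 -- the
         *     trick behind "subs xzr,x6,#1" in the assembly. */
        acc = ((u128)np[0] * m1 + tp[0]) >> 64;
        for (size_t j = 1; j < num; j++) {
            acc += (u128)np[j] * m1 + tp[j];
            tp[j - 1] = (uint64_t)acc;
            acc >>= 64;
        }
        acc += top;
        tp[num - 1] = (uint64_t)acc;
        over = (uint64_t)(acc >> 64);
    }

    /* Final step: subtract the modulus once; keep the difference unless the
     * subtraction borrowed more than the overflow bit covers (.Lsub and
     * .Lcond_copy above do the same, with csel for the conditional copy). */
    uint64_t borrow = 0;
    for (size_t j = 0; j < num; j++) {
        u128 d = (u128)tp[j] - np[j] - borrow;
        rp[j] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) ? 1 : 0;
    }
    if (borrow > over)                     /* result was already below np */
        memcpy(rp, tp, sizeof(tp));
}

The assembly interleaves the two inner passes and keeps tp on the stack below sp, but the carry behaviour spelled out in its comments is the same as above.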
+.type __bn_sqr8x_mont,%function
+.align 5
+__bn_sqr8x_mont:
+ cmp x1,x2
+ b.ne __bn_mul4x_mont
+.Lsqr8x_mont:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp x0,x3,[sp,#96] // offload rp and np
+
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ ldp x10,x11,[x1,#8*4]
+ ldp x12,x13,[x1,#8*6]
+
+ sub x2,sp,x5,lsl#4
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ mov sp,x2 // alloca
+ sub x27,x5,#8*8
+ b .Lsqr8x_zero_start
+
+.Lsqr8x_zero:
+ sub x27,x27,#8*8
+ stp xzr,xzr,[x2,#8*0]
+ stp xzr,xzr,[x2,#8*2]
+ stp xzr,xzr,[x2,#8*4]
+ stp xzr,xzr,[x2,#8*6]
+.Lsqr8x_zero_start:
+ stp xzr,xzr,[x2,#8*8]
+ stp xzr,xzr,[x2,#8*10]
+ stp xzr,xzr,[x2,#8*12]
+ stp xzr,xzr,[x2,#8*14]
+ add x2,x2,#8*16
+ cbnz x27,.Lsqr8x_zero
+
+ add x3,x1,x5
+ add x1,x1,#8*8
+ mov x19,xzr
+ mov x20,xzr
+ mov x21,xzr
+ mov x22,xzr
+ mov x23,xzr
+ mov x24,xzr
+ mov x25,xzr
+ mov x26,xzr
+ mov x2,sp
+ str x4,[x29,#112] // offload n0
+
+ // Multiply everything but a[i]*a[i]
+.align 4
+.Lsqr8x_outer_loop:
+ // a[1]a[0] (i)
+ // a[2]a[0]
+ // a[3]a[0]
+ // a[4]a[0]
+ // a[5]a[0]
+ // a[6]a[0]
+ // a[7]a[0]
+ // a[2]a[1] (ii)
+ // a[3]a[1]
+ // a[4]a[1]
+ // a[5]a[1]
+ // a[6]a[1]
+ // a[7]a[1]
+ // a[3]a[2] (iii)
+ // a[4]a[2]
+ // a[5]a[2]
+ // a[6]a[2]
+ // a[7]a[2]
+ // a[4]a[3] (iv)
+ // a[5]a[3]
+ // a[6]a[3]
+ // a[7]a[3]
+ // a[5]a[4] (v)
+ // a[6]a[4]
+ // a[7]a[4]
+ // a[6]a[5] (vi)
+ // a[7]a[5]
+ // a[7]a[6] (vii)
+
+ mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
+ mul x15,x8,x6
+ mul x16,x9,x6
+ mul x17,x10,x6
+ adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
+ mul x14,x11,x6
+ adcs x21,x21,x15
+ mul x15,x12,x6
+ adcs x22,x22,x16
+ mul x16,x13,x6
+ adcs x23,x23,x17
+ umulh x17,x7,x6 // hi(a[1..7]*a[0])
+ adcs x24,x24,x14
+ umulh x14,x8,x6
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
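[Editor's aside] The enumeration (i)..(vii) near the top of __bn_sqr8x_mont above lists every cross product a[j]*a[i] with j > i for an 8-limb block; the diagonal terms are handled separately, which is why the comment says "Multiply everything but a[i]*a[i]". Below is a minimal C sketch of that squaring shape (cross products accumulated once, then doubled, then the diagonal added), as an illustration only: it assumes 64-bit limbs and unsigned __int128, and it leaves out the 8-limb blocking and the Montgomery reduction that the real routine folds into the same pass.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

typedef unsigned __int128 u128;

/* Square an n-limb number into 2n limbs: accumulate only the cross products
 * a[j]*a[i] with j > i (the schedule listed in __bn_sqr8x_mont), then double
 * the result and add the a[i]*a[i] diagonal. */
static void sqr_words(uint64_t *r, const uint64_t *a, size_t n)
{
    memset(r, 0, 2 * n * sizeof(uint64_t));

    for (size_t i = 0; i < n; i++) {        /* passes (i), (ii), ... (vii) */
        u128 carry = 0;
        for (size_t j = i + 1; j < n; j++) {
            carry += (u128)a[j] * a[i] + r[i + j];
            r[i + j] = (uint64_t)carry;
            carry >>= 64;
        }
        r[i + n] = (uint64_t)carry;
    }

    /* Double the cross products... */
    uint64_t cy = 0;
    for (size_t k = 0; k < 2 * n; k++) {
        uint64_t w = r[k];
        r[k] = (w << 1) | cy;
        cy = w >> 63;
    }

    /* ...then add the squares a[i]*a[i] on the even/odd limb pairs. */
    u128 acc = 0;
    for (size_t i = 0; i < n; i++) {
        acc += (u128)a[i] * a[i] + r[2 * i];
        r[2 * i] = (uint64_t)acc;
        acc >>= 64;
        acc += r[2 * i + 1];
        r[2 * i + 1] = (uint64_t)acc;
        acc >>= 64;
    }
}

The split between cross products and the diagonal is what lets the assembly compute each a[j]*a[i] only once before doubling, rather than forming all n*n products as the generic multiplication path does.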