git: f6210541f9e3 - main - lib/libmd: add optimised SHA1 implementations for aarch64
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Wed, 14 May 2025 23:40:39 UTC
The branch main has been updated by fuz:
URL: https://cgit.FreeBSD.org/src/commit/?id=f6210541f9e3c6cfda321e0ad98f277fb98a625b
commit f6210541f9e3c6cfda321e0ad98f277fb98a625b
Author: Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2025-05-14 19:18:12 +0000
Commit: Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-05-14 23:39:58 +0000
lib/libmd: add optimised SHA1 implementations for aarch64
This provides a scalar implementation and one using the SHA1
instruction set extensions.
For the scalar implementation, the w array is kept in registers,
speeding up the whole operation. For a 10 GiB file on my Windows
2023 Dev Kit (ARM Cortex A78C / ARM Cortex X1C):
Performance core:
pre 43.1s (238 MB/s)
generic 41.3s (247 MB/s)
scalar 35.0s (293 MB/s)
sha1 12.8s (800 MB/s)
Efficiency core:
pre 54.2s (189 MB/s)
generic 55.9s (183 MB/s)
scalar 43.0s (238 MB/s)
sha1 16.2s (632 MB/s)
Reviewed by: getz
Differential Revision: https://reviews.freebsd.org/D45444
---
lib/libmd/aarch64/sha1block.S | 490 +++++++++++++++++++++++++++++++++++++++
lib/libmd/aarch64/sha1dispatch.c | 24 ++
2 files changed, 514 insertions(+)
diff --git a/lib/libmd/aarch64/sha1block.S b/lib/libmd/aarch64/sha1block.S
new file mode 100644
index 000000000000..56a0297efadd
--- /dev/null
+++ b/lib/libmd/aarch64/sha1block.S
@@ -0,0 +1,490 @@
+/*-
+ * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * sha1block_sha1 implementation based on sha1-arm.c,
+ * written and placed in public domain by Jeffrey Walton
+ * based on code from ARM, and by Johannes Schneiders, Skip
+ * Hovsmith and Barry O'Rourke for the mbedTLS project.
+ */
+
+#include <machine/asm.h>
+
+/*
+ * Scalar SHA1 implementation.
+ *
+ * Due to the ample register file available on AArch64, the w array is
+ * kept entirely in registers. The saved a-e variables are instead kept
+ * in memory as we don't have that many registers.
+ */
+
+ // sha1block(SHA1_CTX, buf, len): x0 = ctx, x1 = buf, x2 = len in bytes
+ENTRY(_libmd_sha1block_scalar)
+ctx .req x0 // five 32-bit state words are read from and written to ctx+0 .. ctx+16
+buf .req x1 // input pointer, advanced by 64 after every block
+len .req x2 // byte count, rounded down to a multiple of 64 on entry
+w .req sp // alias for sp; the code in this file addresses sp directly and never uses this name
+a .req w3 // a-e: working copy of the SHA1 state
+b .req w4
+c .req w5
+d .req w6
+e .req w7
+k .req w8 // round constant currently in use
+f .req w9 // result of the func1 .. func4 macro for the current round
+tmp .req w10 // scratch, overwritten by shuffle, func1, func3 and mix
+w_0 .req w11 // w_0-w_15: sliding 16-word window of the message schedule
+w_1 .req w12
+w_2 .req w13
+w_3 .req w14
+w_4 .req w15
+w_5 .req w16
+w_6 .req w17
+// w18 is the platform register, so the window skips it
+w_7 .req w19 // x19-x27 are saved to the stack on entry and reloaded before ret
+w_8 .req w20
+w_9 .req w21
+w_10 .req w22
+w_11 .req w23
+w_12 .req w24
+w_13 .req w25
+w_14 .req w26
+w_15 .req w27
+
+.macro shuffle w_i, w_i3, w_i8, w_i14
+ eor tmp, \w_i14, \w_i8 // tmp = w[i-14] ^ w[i-8]
+ eor \w_i, \w_i3, \w_i // w[i-3] ^ w[i-16]; w_i still holds w[i-16] on entry
+ eor \w_i, tmp, \w_i // combine: w[i-16] ^ w[i-14] ^ w[i-8] ^ w[i-3]
+ ror \w_i, \w_i, #31 // rotate left by one bit, giving w[i] in place
+.endm
+
+.macro func1 a, b, c, d, e
+ bic tmp, \d, \b // tmp = d & ~b
+ and f, \b, \c // f = b & c
+ orr f, tmp, f // f = (b & c) | (~b & d)
+.endm
+
+.macro func2 a, b, c, d, e
+ eor f, \d, \c // f = c ^ d
+ eor f, f, \b // f = b ^ c ^ d
+.endm
+
+.macro func3 a, b, c, d, e
+ and f, \c, \b // f = b & c
+ eor tmp, \c, \b // tmp = b ^ c
+ and tmp, \d, tmp // tmp = (b ^ c) & d
+ orr f, tmp, f // f = (b & c) | ((b ^ c) & d)
+.endm
+
+.macro func4 a, b, c, d, e
+ func2 \a, \b, \c, \d, \e // same function as func2: f = b ^ c ^ d
+.endm
+
+.macro mix a, b, c, d, e, w_i
+ ror \b, \b, #2 // b = b rol 30; callers expand the f macro first, so f saw the old b
+ ror tmp, \a, #27 // tmp = a rol 5
+ add \e, \e, \w_i // the sum is built in two short chains (e and tmp) ...
+ add tmp, tmp, k
+ add \e, \e, f
+ add \e, \e, tmp // (a ror 27) + e + f + k + w[i]
+.endm
+
+.macro round1 a, b, c, d, e, w_i
+ func1 \a, \b, \c, \d, \e // f = func1(b, c, d)
+ rev \w_i, \w_i // byte-swap the message word; callers load it from buf just before
+ mix \a, \b, \c, \d, \e, \w_i // e += (a rol 5) + f + k + w[i]; b = b rol 30
+.endm
+
+.macro round func, a, b, c, d, e, w_i, w_i3, w_i8, w_i14
+ shuffle \w_i, \w_i3, \w_i8, \w_i14 // replace w[i-16] in w_i by the new w[i]
+ \func \a, \b, \c, \d, \e // func selects the f macro: func1, func2, func3 or func4
+ mix \a, \b, \c, \d, \e, \w_i // e += (a rol 5) + f + k + w[i]; b = b rol 30
+.endm
+
+.macro round1x a, b, c, d, e, w_i, w_i3, w_i8, w_i14
+ round func1, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14 // func1 round with a computed w[i]; used for rounds 16-19
+.endm
+
+.macro round2 a, b, c, d, e, w_i, w_i3, w_i8, w_i14
+ round func2, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14 // func2 round; used for rounds 20-39
+.endm
+
+.macro round3 a, b, c, d, e, w_i, w_i3, w_i8, w_i14
+ round func3, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14 // func3 round; used for rounds 40-59
+.endm
+
+.macro round4 a, b, c, d, e, w_i, w_i3, w_i8, w_i14
+ round func4, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14 // func4 round; used for rounds 60-79
+.endm
+
+ ands len, len, #~63 // round len down to a multiple of 64, the bytes consumed per loop pass
+ beq 1f // less than one full block: return without touching ctx or the stack
+
+ sub sp, sp, #24+9*8 // frame: saved state at sp+0 (24 bytes reserved), then nine 8-byte slots
+ str x19, [sp, #24+0*8] // x19-x27 carry w_7-w_15 below, so their old values are kept here
+ stp x20, x21, [sp, #24+1*8]
+ stp x22, x23, [sp, #24+3*8]
+ stp x24, x25, [sp, #24+5*8]
+ stp x26, x27, [sp, #24+7*8]
+
+ ldp a, b, [ctx, #0] // load SHA1 state from context
+ ldp c, d, [ctx, #8]
+ ldr e, [ctx, #16]
+
+0: stp a, b, [sp, #0] // loop top, one 64-byte block per pass: save old SHA1 state
+ stp c, d, [sp, #8]
+ str e, [sp, #16]
+
+ movz k, #0x7999 // round constant 1
+ movk k, #0x5a82, lsl #16 // k = 0x5a827999
+
+ ldp w_0, w_1, [buf, #0*4] // rounds 0-15: words are loaded in pairs and byte-swapped by round1
+ round1 a, b, c, d, e, w_0 // every round shifts the roles of a-e by one place
+ round1 e, a, b, c, d, w_1
+
+ ldp w_2, w_3, [buf, #2*4]
+ round1 d, e, a, b, c, w_2
+ round1 c, d, e, a, b, w_3
+
+ ldp w_4, w_5, [buf, #4*4]
+ round1 b, c, d, e, a, w_4
+ round1 a, b, c, d, e, w_5
+
+ ldp w_6, w_7, [buf, #6*4]
+ round1 e, a, b, c, d, w_6
+ round1 d, e, a, b, c, w_7
+
+ ldp w_8, w_9, [buf, #8*4]
+ round1 c, d, e, a, b, w_8
+ round1 b, c, d, e, a, w_9
+
+ ldp w_10, w_11, [buf, #10*4]
+ round1 a, b, c, d, e, w_10
+ round1 e, a, b, c, d, w_11
+
+ ldp w_12, w_13, [buf, #12*4]
+ round1 d, e, a, b, c, w_12
+ round1 c, d, e, a, b, w_13
+
+ ldp w_14, w_15, [buf, #14*4]
+ round1 b, c, d, e, a, w_14
+ round1 a, b, c, d, e, w_15
+
+ round1x e, a, b, c, d, w_0, w_13, w_8, w_2 // rounds 16-19: last four operands hold w[i-16], w[i-3], w[i-8], w[i-14]
+ round1x d, e, a, b, c, w_1, w_14, w_9, w_3
+ round1x c, d, e, a, b, w_2, w_15, w_10, w_4
+ round1x b, c, d, e, a, w_3, w_0, w_11, w_5
+
+ movz k, #0xeba1 // round constant 2
+ movk k, #0x6ed9, lsl #16 // k = 0x6ed9eba1
+
+ round2 a, b, c, d, e, w_4, w_1, w_12, w_6 // rounds 20-39
+ round2 e, a, b, c, d, w_5, w_2, w_13, w_7
+ round2 d, e, a, b, c, w_6, w_3, w_14, w_8
+ round2 c, d, e, a, b, w_7, w_4, w_15, w_9
+ round2 b, c, d, e, a, w_8, w_5, w_0, w_10
+
+ round2 a, b, c, d, e, w_9, w_6, w_1, w_11
+ round2 e, a, b, c, d, w_10, w_7, w_2, w_12
+ round2 d, e, a, b, c, w_11, w_8, w_3, w_13
+ round2 c, d, e, a, b, w_12, w_9, w_4, w_14
+ round2 b, c, d, e, a, w_13, w_10, w_5, w_15
+
+ round2 a, b, c, d, e, w_14, w_11, w_6, w_0
+ round2 e, a, b, c, d, w_15, w_12, w_7, w_1
+ round2 d, e, a, b, c, w_0, w_13, w_8, w_2
+ round2 c, d, e, a, b, w_1, w_14, w_9, w_3
+ round2 b, c, d, e, a, w_2, w_15, w_10, w_4
+
+ round2 a, b, c, d, e, w_3, w_0, w_11, w_5
+ round2 e, a, b, c, d, w_4, w_1, w_12, w_6
+ round2 d, e, a, b, c, w_5, w_2, w_13, w_7
+ round2 c, d, e, a, b, w_6, w_3, w_14, w_8
+ round2 b, c, d, e, a, w_7, w_4, w_15, w_9
+
+ movz k, #0xbcdc // round constant 3
+ movk k, #0x8f1b, lsl #16 // k = 0x8f1bbcdc
+
+ round3 a, b, c, d, e, w_8, w_5, w_0, w_10 // rounds 40-59
+ round3 e, a, b, c, d, w_9, w_6, w_1, w_11
+ round3 d, e, a, b, c, w_10, w_7, w_2, w_12
+ round3 c, d, e, a, b, w_11, w_8, w_3, w_13
+ round3 b, c, d, e, a, w_12, w_9, w_4, w_14
+
+ round3 a, b, c, d, e, w_13, w_10, w_5, w_15
+ round3 e, a, b, c, d, w_14, w_11, w_6, w_0
+ round3 d, e, a, b, c, w_15, w_12, w_7, w_1
+ round3 c, d, e, a, b, w_0, w_13, w_8, w_2
+ round3 b, c, d, e, a, w_1, w_14, w_9, w_3
+
+ round3 a, b, c, d, e, w_2, w_15, w_10, w_4
+ round3 e, a, b, c, d, w_3, w_0, w_11, w_5
+ round3 d, e, a, b, c, w_4, w_1, w_12, w_6
+ round3 c, d, e, a, b, w_5, w_2, w_13, w_7
+ round3 b, c, d, e, a, w_6, w_3, w_14, w_8
+
+ round3 a, b, c, d, e, w_7, w_4, w_15, w_9
+ round3 e, a, b, c, d, w_8, w_5, w_0, w_10
+ round3 d, e, a, b, c, w_9, w_6, w_1, w_11
+ round3 c, d, e, a, b, w_10, w_7, w_2, w_12
+ round3 b, c, d, e, a, w_11, w_8, w_3, w_13
+
+ movz k, #0xc1d6 // round constant 4
+ movk k, #0xca62, lsl #16 // k = 0xca62c1d6
+
+ round4 a, b, c, d, e, w_12, w_9, w_4, w_14 // rounds 60-79
+ round4 e, a, b, c, d, w_13, w_10, w_5, w_15
+ round4 d, e, a, b, c, w_14, w_11, w_6, w_0
+ round4 c, d, e, a, b, w_15, w_12, w_7, w_1
+ round4 b, c, d, e, a, w_0, w_13, w_8, w_2
+
+ round4 a, b, c, d, e, w_1, w_14, w_9, w_3
+ round4 e, a, b, c, d, w_2, w_15, w_10, w_4
+ round4 d, e, a, b, c, w_3, w_0, w_11, w_5
+ round4 c, d, e, a, b, w_4, w_1, w_12, w_6
+ round4 b, c, d, e, a, w_5, w_2, w_13, w_7
+
+ round4 a, b, c, d, e, w_6, w_3, w_14, w_8
+ round4 e, a, b, c, d, w_7, w_4, w_15, w_9
+ round4 d, e, a, b, c, w_8, w_5, w_0, w_10
+ round4 c, d, e, a, b, w_9, w_6, w_1, w_11
+ round4 b, c, d, e, a, w_10, w_7, w_2, w_12
+
+ round4 a, b, c, d, e, w_11, w_8, w_3, w_13
+ round4 e, a, b, c, d, w_12, w_9, w_4, w_14
+ round4 d, e, a, b, c, w_13, w_10, w_5, w_15
+ round4 c, d, e, a, b, w_14, w_11, w_6, w_0
+ round4 b, c, d, e, a, w_15, w_12, w_7, w_1
+
+ ldp w_0, w_1, [sp, #0] // reload saved SHA1 state; w_0-w_4 are free now that the rounds are done
+ ldp w_2, w_3, [sp, #8]
+ ldr w_4, [sp, #16]
+
+ add a, a, w_0 // new state = result of the 80 rounds + state before the block
+ add b, b, w_1
+ add c, c, w_2
+ add d, d, w_3
+ add e, e, w_4
+
+ add buf, buf, #64 // step to the next block
+ subs len, len, #64
+ bhi 0b // loop while blocks remain
+
+ stp a, b, [ctx, #0] // write updated SHA1 state
+ stp c, d, [ctx, #8]
+ str e, [ctx, #16]
+
+ ldr x19, [sp, #24+0*8] // restore x19-x27 from the slots filled on entry
+ ldp x20, x21, [sp, #24+1*8]
+ ldp x22, x23, [sp, #24+3*8]
+ ldp x24, x25, [sp, #24+5*8]
+ ldp x26, x27, [sp, #24+7*8]
+ add sp, sp, #24+9*8 // release the frame
+
+1: ret
+END(_libmd_sha1block_scalar)
+
+/*
+ * SHA1 implementation using the SHA1 instruction set extension.
+ */
+
+ .arch_extension sha2
+
+ // sha1block(SHA1_CTX, buf, len): same arguments as the scalar version, using the SHA1 instructions
+ENTRY(_libmd_sha1block_sha1)
+ /* ctx, buf, len: same as for sha1block_scalar */
+kaddr .req x3 // address of the k1234 table
+abcd .req v0 // first 16 bytes of the context (four 32-bit state words)
+abcd_q .req q0 // alias for use with scalar instructions
+abcd_s .req s0 // lowest 32-bit lane of abcd, the source operand of sha1h
+e0 .req s1 // word at ctx+16; e0 and e1 swap roles every four rounds
+e0_v .req v1 // vector view of e0, used by the mov/add around the rounds
+e1 .req s2
+abcd_saved .req v3 // state as it was at the start of the current block
+e0_saved .req v4
+tmp0 .req v5 // tmp0/tmp1: message words plus round constant, prepared two groups ahead
+tmp1 .req v6
+msg0 .req v16 // msg0-msg3: the 16 most recent message schedule words
+msg1 .req v17
+msg2 .req v18
+msg3 .req v19
+k0 .req v20 // k0-k3: one k1234 entry each, replicated across all four lanes
+k1 .req v21
+k2 .req v22
+k3 .req v23
+
+ ands len, len, #~63 // take length in multiples of block length
+ beq 1f // bail out if input empty
+
+ ldr abcd_q, [ctx, #0] // load the five state words: four into abcd ...
+ ldr e0, [ctx, #16] // ... and the fifth into e0
+
+ adrp kaddr, k1234
+ add kaddr, kaddr, #:lo12:k1234 // kaddr = address of k1234
+ ld4r {k0.4s, k1.4s, k2.4s, k3.4s}, [kaddr] // replicate each of the four constants into its own register
+
+0: mov abcd_saved.16b, abcd.16b // loop top: remember the state for the final add
+ mov e0_saved.16b, e0_v.16b
+
+ ld1 {msg0.4s, msg1.4s, msg2.4s, msg3.4s}, [buf], #64 // fetch one 64-byte block, buf += 64
+ rev32 msg0.16b, msg0.16b // byte-swap every 32-bit word of the block
+ rev32 msg1.16b, msg1.16b
+ rev32 msg2.16b, msg2.16b
+ rev32 msg3.16b, msg3.16b
+
+ add tmp0.4s, msg0.4s, k0.4s // w + k for rounds 0--3
+ add tmp1.4s, msg1.4s, k0.4s // w + k for rounds 4--7
+
+ /* rounds 0--3 */
+ sha1h e1, abcd_s // e1 is taken from lane 0 of abcd before abcd is updated
+ sha1c abcd_q, e0, tmp0.4s // four rounds on abcd, consuming e0 and tmp0
+ add tmp0.4s, msg2.4s, k0.4s // tmp0 is free again: prepare w + k for rounds 8--11
+ sha1su0 msg0.4s, msg1.4s, msg2.4s // start turning msg0 into the words for rounds 16--19
+
+ /* rounds 4--7 */
+ sha1h e0, abcd_s
+ sha1c abcd_q, e1, tmp1.4s
+ add tmp1.4s, msg3.4s, k0.4s
+ sha1su1 msg0.4s, msg3.4s // second half of the msg0 update started above
+ sha1su0 msg1.4s, msg2.4s, msg3.4s
+
+ /* rounds 8--11 */
+ sha1h e1, abcd_s
+ sha1c abcd_q, e0, tmp0.4s
+ add tmp0.4s, msg0.4s, k0.4s
+ sha1su1 msg1.4s, msg0.4s
+ sha1su0 msg2.4s, msg3.4s, msg0.4s
+
+ /* rounds 12--15 */
+ sha1h e0, abcd_s
+ sha1c abcd_q, e1, tmp1.4s
+ add tmp1.4s, msg1.4s, k1.4s // first use of k1: this sum is consumed in rounds 20--23
+ sha1su1 msg2.4s, msg1.4s
+ sha1su0 msg3.4s, msg0.4s, msg1.4s
+
+ /* rounds 16--19 */
+ sha1h e1, abcd_s
+ sha1c abcd_q, e0, tmp0.4s
+ add tmp0.4s, msg2.4s, k1.4s
+ sha1su1 msg3.4s, msg2.4s
+ sha1su0 msg0.4s, msg1.4s, msg2.4s
+
+ /* rounds 20--23 */
+ sha1h e0, abcd_s
+ sha1p abcd_q, e1, tmp1.4s // from round 20 on sha1p takes the place of sha1c
+ add tmp1.4s, msg3.4s, k1.4s
+ sha1su1 msg0.4s, msg3.4s
+ sha1su0 msg1.4s, msg2.4s, msg3.4s
+
+ /* rounds 24--27 */
+ sha1h e1, abcd_s
+ sha1p abcd_q, e0, tmp0.4s
+ add tmp0.4s, msg0.4s, k1.4s
+ sha1su1 msg1.4s, msg0.4s
+ sha1su0 msg2.4s, msg3.4s, msg0.4s
+
+ /* rounds 28--31 */
+ sha1h e0, abcd_s
+ sha1p abcd_q, e1, tmp1.4s
+ add tmp1.4s, msg1.4s, k1.4s
+ sha1su1 msg2.4s, msg1.4s
+ sha1su0 msg3.4s, msg0.4s, msg1.4s
+
+ /* rounds 32--35 */
+ sha1h e1, abcd_s
+ sha1p abcd_q, e0, tmp0.4s
+ add tmp0.4s, msg2.4s, k2.4s // first use of k2: this sum is consumed in rounds 40--43
+ sha1su1 msg3.4s, msg2.4s
+ sha1su0 msg0.4s, msg1.4s, msg2.4s
+
+ /* rounds 36--39 */
+ sha1h e0, abcd_s
+ sha1p abcd_q, e1, tmp1.4s
+ add tmp1.4s, msg3.4s, k2.4s
+ sha1su1 msg0.4s, msg3.4s
+ sha1su0 msg1.4s, msg2.4s, msg3.4s
+
+ /* rounds 40--43 */
+ sha1h e1, abcd_s
+ sha1m abcd_q, e0, tmp0.4s // rounds 40--59 use sha1m
+ add tmp0.4s, msg0.4s, k2.4s
+ sha1su1 msg1.4s, msg0.4s
+ sha1su0 msg2.4s, msg3.4s, msg0.4s
+
+ /* rounds 44--47 */
+ sha1h e0, abcd_s
+ sha1m abcd_q, e1, tmp1.4s
+ add tmp1.4s, msg1.4s, k2.4s
+ sha1su1 msg2.4s, msg1.4s
+ sha1su0 msg3.4s, msg0.4s, msg1.4s
+
+ /* rounds 48--51 */
+ sha1h e1, abcd_s
+ sha1m abcd_q, e0, tmp0.4s
+ add tmp0.4s, msg2.4s, k2.4s
+ sha1su1 msg3.4s, msg2.4s
+ sha1su0 msg0.4s, msg1.4s, msg2.4s
+
+ /* rounds 52--55 */
+ sha1h e0, abcd_s
+ sha1m abcd_q, e1, tmp1.4s
+ add tmp1.4s, msg3.4s, k3.4s // first use of k3: this sum is consumed in rounds 60--63
+ sha1su1 msg0.4s, msg3.4s
+ sha1su0 msg1.4s, msg2.4s, msg3.4s
+
+ /* rounds 56--59 */
+ sha1h e1, abcd_s
+ sha1m abcd_q, e0, tmp0.4s
+ add tmp0.4s, msg0.4s, k3.4s
+ sha1su1 msg1.4s, msg0.4s
+ sha1su0 msg2.4s, msg3.4s, msg0.4s
+
+ /* rounds 60--63 */
+ sha1h e0, abcd_s
+ sha1p abcd_q, e1, tmp1.4s // sha1p again, as in rounds 20--39 (the scalar code likewise reuses func2)
+ add tmp1.4s, msg1.4s, k3.4s
+ sha1su1 msg2.4s, msg1.4s
+ sha1su0 msg3.4s, msg0.4s, msg1.4s
+
+ /* rounds 64--67 */
+ sha1h e1, abcd_s
+ sha1p abcd_q, e0, tmp0.4s
+ add tmp0.4s, msg2.4s, k3.4s // w + k for rounds 72--75
+ sha1su1 msg3.4s, msg2.4s // completes msg3, the last schedule words that are needed
+ // no sha1su0 on msg0 here: it would start w[80..83], which no round consumes
+
+ /* rounds 68--71 */
+ sha1h e0, abcd_s
+ sha1p abcd_q, e1, tmp1.4s
+ add tmp1.4s, msg3.4s, k3.4s // w + k for rounds 76--79
+ // likewise no sha1su1 on msg0: the schedule ends with msg3 = w[76..79]
+
+ /* rounds 72--75 */
+ sha1h e1, abcd_s
+ sha1p abcd_q, e0, tmp0.4s
+
+ /* rounds 76--79 */
+ sha1h e0, abcd_s
+ sha1p abcd_q, e1, tmp1.4s
+
+ add e0_v.4s, e0_v.4s, e0_saved.4s // new state = result of the 80 rounds + state before the block
+ add abcd.4s, abcd.4s, abcd_saved.4s
+
+ subs len, len, #64 // one block done; buf was already advanced by the ld1
+ bhi 0b // loop while blocks remain
+
+ str abcd_q, [ctx, #0] // write the updated state back to the context
+ str e0, [ctx, #16]
+
+1: ret
+END(_libmd_sha1block_sha1)
+
+ .section .rodata
+ .p2align 4 // 16-byte alignment
+k1234: .word 0x5a827999 // replicated into k0: rounds 0--19
+ .word 0x6ed9eba1 // replicated into k1: rounds 20--39
+ .word 0x8f1bbcdc // replicated into k2: rounds 40--59
+ .word 0xca62c1d6 // replicated into k3: rounds 60--79
+ .size k1234, .-k1234
+
+ .section .note.GNU-stack,"",%progbits
diff --git a/lib/libmd/aarch64/sha1dispatch.c b/lib/libmd/aarch64/sha1dispatch.c
new file mode 100644
index 000000000000..e34bf0a1a344
--- /dev/null
+++ b/lib/libmd/aarch64/sha1dispatch.c
@@ -0,0 +1,24 @@
+/*-
+ * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <machine/ifunc.h>
+#include <sha.h>
+#include <sys/auxv.h>
+
+extern void _libmd_sha1block_scalar(SHA1_CTX *, const void *, size_t);
+extern void _libmd_sha1block_sha1(SHA1_CTX *, const void *, size_t);
+
+DEFINE_IFUNC(, void, sha1_block, (SHA1_CTX *, const void *, size_t))
+{
+ unsigned long caps = 0; /* remains 0 unless elf_aux_info fills it in */
+
+ elf_aux_info(AT_HWCAP, &caps, sizeof(caps));
+
+ /* Pick the SHA1-instruction version only when HWCAP_SHA1 is set. */
+ if ((caps & HWCAP_SHA1) == 0)
+ return (_libmd_sha1block_scalar);
+ return (_libmd_sha1block_sha1);
+}