git: c1135b2b54bf - main - lib/libmd: import aarch64 md5 SIMD implementation

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Fri, 24 Oct 2025 10:18:50 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=c1135b2b54bf46709120d98c90ff4d28a77b896c

commit c1135b2b54bf46709120d98c90ff4d28a77b896c
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2025-10-10 17:45:45 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-10-24 10:17:11 +0000

    lib/libmd: import aarch64 md5 SIMD implementation
    
    Reviewed by:    andrew, imp
    Approved by:    markj (mentor)
    Differential Revision:  https://reviews.freebsd.org/D45670
    MFC after:      1 month
---
 lib/libmd/aarch64/md5block.S | 206 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 206 insertions(+)

diff --git a/lib/libmd/aarch64/md5block.S b/lib/libmd/aarch64/md5block.S
new file mode 100644
index 000000000000..b928c8dd795a
--- /dev/null
+++ b/lib/libmd/aarch64/md5block.S
@@ -0,0 +1,206 @@
+/*-
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <sys/elf_common.h>
+#include <machine/asm.h>
+
+# optimal instruction sequence for k = \key + \m
+.macro	addkm	key, m
+.if 0x100000000 - \key > 0x00ffffff	// -key mod 2^32 does not fit in 24 bits
+	movz	k, #\key & 0xffff		// k = bits 0..15 of key
+	movk	k, #\key >> 16, lsl #16	// fill in bits 16..31 of key
+	add	k, k, \m			// k = key + m
+.elseif 0x100000000 - \key > 0x0000ffff	// -key fits in 24 bits but not in 16
+	sub	k, \m, #(0x100000000 - \key) & 0xfff000	// k = m - (bits 12..23 of -key)
+	sub	k, k, #(0x100000000 - \key) & 0xfff	// k -= bits 0..11 of -key
+.else					// -key fits in 16 bits
+	movz	k, #0x100000000 - \key	// k = -key mod 2^32
+	sub	k, \m, k			// k = m - k, i.e. key + m mod 2^32
+.endif
+.endm
+
+.macro	round	acc, x, y, z, fn, key, msg, rot
+	\fn	f, \x, \y, \z		// f = fn(x, y, z)
+	addkm	\key, \msg		// k = key + msg
+	add	\acc, k, \acc		// acc += k
+	add	\acc, f, \acc		// acc += f
+	ror	\acc, \acc, #32-\rot	// rotate acc left by rot bits
+	add	\acc, \x, \acc		// acc += x
+.endm
+
+	/* select: dst = sel ? x : y, computed as ((x ^ y) & sel) ^ y */
+.macro	f0	dst, sel, x, y
+	eor	\dst, \y, \x
+	and	\dst, \sel, \dst
+	eor	\dst, \y, \dst
+.endm
+
+	/*
+	 * round with f1 = z ? x : y folded in: (z & x) and (~z & y) never
+	 * share a set bit, so both terms are simply added into acc in turn
+	 */
+.macro	round1	acc, x, y, z, key, msg, rot
+	bic	tmp, \y, \z		// tmp = y & ~z
+	addkm	\key, \msg		// k = key + msg
+	add	\acc, k, \acc		// acc += k
+	and	f, \z, \x		// f = z & x
+	add	\acc, tmp, \acc		// acc += y & ~z
+	add	\acc, f, \acc		// acc += z & x
+	ror	\acc, \acc, #32-\rot	// rotate acc left by rot bits
+	add	\acc, \x, \acc		// acc += x
+.endm
+
+	/* parity: dst = x ^ y ^ z */
+.macro	f2	dst, x, y, z
+	eor	\dst, \z, \y
+	eor	\dst, \x, \dst
+.endm
+
+	/* dst = y ^ (x | ~z) */
+.macro	f3	dst, x, y, z
+	orn	\dst, \x, \z
+	eor	\dst, \y, \dst
+.endm
+
+	/* four consecutive rounds with one function; a, b, c, d swap roles each time */
+.macro	rounds	fn, ma, mb, mc, md, ra, rb, rc, rd, ka, kb, kc, kd
+	round	a, b, c, d, \fn, \ka, \ma, \ra
+	round	d, a, b, c, \fn, \kb, \mb, \rb
+	round	c, d, a, b, \fn, \kc, \mc, \rc
+	round	b, c, d, a, \fn, \kd, \md, \rd
+.endm
+
+	/* four rounds using f0 with the fixed rotations 7, 12, 17, 22 */
+.macro	rounds0	ma, mb, mc, md, ka, kb, kc, kd
+	rounds	f0, \ma, \mb, \mc, \md, 7, 12, 17, 22, \ka, \kb, \kc, \kd
+.endm
+
+.macro	rounds1	ma, mb, mc, md, ka, kb, kc, kd
+	round1	a, b, c, d, \ka, \ma,  5	// updates a, rotation 5
+	round1	d, a, b, c, \kb, \mb,  9	// updates d, rotation 9
+	round1	c, d, a, b, \kc, \mc, 14	// updates c, rotation 14
+	round1	b, c, d, a, \kd, \md, 20	// updates b, rotation 20
+.endm
+
+.macro	rounds2	ma, mb, mc, md, ka, kb, kc, kd
+	rounds	f2, \ma, \mb, \mc, \md, 4, 11, 16, 23, \ka, \kb, \kc, \kd	// four f2 rounds
+.endm
+
+.macro	rounds3	ma, mb, mc, md, ka, kb, kc, kd
+	rounds	f3, \ma, \mb, \mc, \md, 6, 10, 15, 21, \ka, \kb, \kc, \kd	// four f3 rounds
+.endm
+
+	/* md5block(MD5_CTX, buf, len): process each whole 64-byte block of buf, updating the state in ctx */
+ENTRY(_libmd_md5block)
+ctx	.req	x0			// state words a, b, c, d at byte offsets 0, 4, 8, 12
+buf	.req	x1			// input pointer, advanced by 64 after each block
+len	.req	x2			// input length in bytes
+end	.req	x2			// aliases len
+a	.req	w3			// a, b, c, d: working state
+b	.req	w4
+c	.req	w5
+d	.req	w6
+f	.req	w7			// round function output
+tmp	.req	w8			// scratch used by round1
+k	.req	w9			// output of addkm
+m0	.req	w10			// m0..m15: the sixteen 32-bit words of the current block
+m1	.req	w11
+m2	.req	w12
+m3	.req	w13
+m4	.req	w14
+m5	.req	w15
+m6	.req	w16
+m7	.req	w17
+					// x18 is the platform register
+m8	.req	w19			// w19..w26 are the low halves of x19..x26, saved below
+m9	.req	w20
+m10	.req	w21
+m11	.req	w22
+m12	.req	w23
+m13	.req	w24
+m14	.req	w25
+m15	.req	w26
+
+a_	.req	m0			// a_..d_ receive the old state for the final addition;
+b_	.req	m7			// they reuse message registers, which are reloaded at
+c_	.req	m14			// the top of every loop iteration
+d_	.req	m5
+
+	stp	x19, x20, [sp, #-0x40]!	// allocate 64 bytes, save x19-x26 (callee-saved under AAPCS64)
+	stp	x21, x22, [sp, #0x10]
+	stp	x23, x24, [sp, #0x20]
+	stp	x25, x26, [sp, #0x30]
+
+	bics	len, len, #63		// len &= ~63 (bytes in whole blocks); sets Z if that is zero
+	add	end, buf, len		// end pointer
+
+	beq	.Lend			// was len == 0 after BICS? (ADD leaves the flags alone)
+
+	ldp	a, b, [ctx, #0]		// load the current state
+	ldp	c, d, [ctx, #8]
+
+	/* first eight rounds interleaved with data loads; each ldp fetches two message words */
+.Lloop:	ldp	m0, m1, [buf, #0]
+	round	a, b, c, d, f0, 0xd76aa478, m0,  7
+	ldp	m2, m3, [buf, #8]
+	round	d, a, b, c, f0, 0xe8c7b756, m1, 12
+	ldp	m4, m5, [buf, #16]
+	round	c, d, a, b, f0, 0x242070db, m2, 17
+	ldp	m6, m7, [buf, #24]
+	round	b, c, d, a, f0, 0xc1bdceee, m3, 22
+
+	ldp	m8, m9, [buf, #32]
+	round	a, b, c, d, f0, 0xf57c0faf, m4,  7
+	ldp	m10, m11, [buf, #40]
+	round	d, a, b, c, f0, 0x4787c62a, m5, 12
+	ldp	m12, m13, [buf, #48]
+	round	c, d, a, b, f0, 0xa8304613, m6, 17
+	ldp	m14, m15, [buf, #56]
+	round	b, c, d, a, f0, 0xfd469501, m7, 22
+
+	/* remaining rounds use the roundsX macros (four rounds per invocation) */
+	rounds0	 m8,  m9, m10, m11, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
+	rounds0	m12, m13, m14, m15, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
+
+	rounds1	 m1,  m6, m11,  m0, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
+	rounds1	 m5, m10, m15,  m4, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
+	rounds1	 m9, m14,  m3,  m8, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
+	rounds1	m13,  m2,  m7, m12, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
+
+	rounds2	 m5,  m8, m11, m14, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
+	rounds2	 m1,  m4,  m7, m10, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
+	rounds2	m13,  m0,  m3,  m6, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
+	rounds2	 m9, m12, m15,  m2, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
+
+	rounds3	 m0,  m7, m14,  m5, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
+	rounds3	m12,  m3, m10,  m1, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
+	rounds3	 m8, m15,  m6, m13, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
+	rounds3	 m4, m11,  m2,  m9, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+
+	ldp	a_, b_, [ctx, #0]	// reload the state from before this block
+	ldp	c_, d_, [ctx, #8]
+	add	a, a, a_		// add it to the result of the 64 rounds
+	add	b, b, b_
+	add	c, c, c_
+	add	d, d, d_
+	stp	a, b, [ctx, #0]		// write the new state back to ctx
+	stp	c, d, [ctx, #8]
+
+	add	buf, buf, #64		// advance to the next block
+	cmp	buf, end
+	bne	.Lloop			// repeat until buf reaches end
+
+.Lend:	ldp	x25, x26, [sp, #0x30]	// restore x19-x26; the last ldp also releases the frame
+	ldp	x23, x24, [sp, #0x20]
+	ldp	x21, x22, [sp, #0x10]
+	ldp	x19, x20, [sp], #0x40
+
+	ret
+END(_libmd_md5block)
+
+GNU_PROPERTY_AARCH64_FEATURE_1_NOTE(GNU_PROPERTY_AARCH64_FEATURE_1_VAL)	// NOTE(review): macro from the included headers, not visible here; presumably emits the GNU property feature note -- confirm
+
+	.section .note.GNU-stack,"",%progbits	// section with empty flags and no contents; presumably the usual non-executable-stack marker -- confirm