git: 3863fec1ce2d - main - lib/libc/aarch64/string: add strlen SIMD implementation
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Fri, 10 Jan 2025 15:04:00 UTC
The branch main has been updated by fuz:
URL: https://cgit.FreeBSD.org/src/commit/?id=3863fec1ce2dc6033f094a085118605ea89db9e2
commit 3863fec1ce2dc6033f094a085118605ea89db9e2
Author: Getz Mikalsen <getz@FreeBSD.org>
AuthorDate: 2024-08-26 19:54:32 +0000
Commit: Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-01-10 15:02:40 +0000
lib/libc/aarch64/string: add strlen SIMD implementation
Adds a SIMD enhanced strlen for Aarch64. It takes inspiration from
the amd64 implementation but I struggled getting the performance I
had hoped for on cores like the Graviton3 when compared to the
existing implementation from Arm Optimized Routines.
See the DR for benchmark results.
Tested by: fuz (exprun)
Reviewed by: fuz, emaste
Sponsored by: Google LLC (GSoC 2024)
PR: 281175
Differential Revision: https://reviews.freebsd.org/D45623
---
lib/libc/aarch64/string/Makefile.inc | 4 ++--
lib/libc/aarch64/string/strlen.S | 46 ++++++++++++++++++++++++++++++++++++
2 files changed, 48 insertions(+), 2 deletions(-)
diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
index f8c67319fe12..7325b54d9716 100644
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -14,7 +14,6 @@ AARCH64_STRING_FUNCS= \
strchr \
strchrnul \
strcpy \
- strlen \
strnlen \
strrchr
@@ -30,7 +29,8 @@ MDSRCS+= \
strncmp.S \
memccpy.S \
strncat.c \
- strlcat.c
+ strlcat.c \
+ strlen.S
#
# Add the above functions. Generate an asm file that includes the needed
diff --git a/lib/libc/aarch64/string/strlen.S b/lib/libc/aarch64/string/strlen.S
new file mode 100644
index 000000000000..7bfac7f4b1e1
--- /dev/null
+++ b/lib/libc/aarch64/string/strlen.S
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
+*/
+
+#include <machine/asm.h>
+
+ .weak strlen // give the public name strlen weak binding
+ .set strlen, __strlen // strlen takes the value (address) of __strlen
+ .text
+
+ENTRY(__strlen)
+        // size_t strlen(const char *s): x0 = s -> x0 = length; clobbers x1-x3, x9, x10, v0
+        bic     x10, x0, #0xf           // x10 = s rounded down to a 16-byte boundary
+        and     x9, x0, #0xf            // x9 = offset of s inside that chunk
+        ldr     q0, [x10]               // aligned load: may read before s, never crosses a page
+        cmeq    v0.16b, v0.16b, #0      // 0xff in every byte lane that holds NUL
+        shrn    v0.8b, v0.8h, #4        // narrow to a 64-bit mask, 4 bits per source byte
+        fmov    x1, d0
+        cbz     x9, .Laligned
+        lsl     x2, x0, #2              // 4 mask bits per byte; lsr only uses x2 mod 64
+        lsr     x1, x1, x2              // drop the mask bits of bytes that precede s
+        cbz     x1, .Lloop              // no NUL at or after s in the first chunk
+        rbit    x1, x1                  // rbit + clz = count trailing zeros
+        clz     x0, x1
+        lsr     x0, x0, #2              // mask bits -> bytes: index of the first NUL
+        ret
+
+.Laligned:
+        cbnz    x1, .Ldone              // NUL in the first chunk; x10 == x0 on this path
+
+.Lloop:
+        ldr     q0, [x10, #16]!         // pre-increment x10, then load the next chunk
+        cmeq    v0.16b, v0.16b, #0
+        shrn    v0.8b, v0.8h, #4        // reduce to fit mask in GPR
+        fmov    x1, d0                  // test the mask as an integer: fcmp would treat a denormal
+        cbz     x1, .Lloop              // mask as 0.0 under FPCR.FZ and set FPSR.IOC on an sNaN one
+.Ldone:
+        sub     x0, x10, x0             // bytes between s and the chunk holding the NUL
+        rbit    x1, x1                  // reverse bits as NEON has no ctz
+        clz     x3, x1
+        lsr     x3, x3, #2              // mask bits -> byte index inside the chunk
+        add     x0, x0, x3
+        ret
+END(__strlen)