git: 08af0bbc9c7d - main - libc: scalar strchrnul() in RISC-V assembly

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Fri, 31 Oct 2025 12:48:40 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=08af0bbc9c7d71bbaadb31ad31f8492f40537c5c

commit 08af0bbc9c7d71bbaadb31ad31f8492f40537c5c
Author:     Strahinja Stanišić <strajabot@FreeBSD.org>
AuthorDate: 2024-07-19 17:58:04 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-10-31 12:48:00 +0000

    libc: scalar strchrnul() in RISC-V assembly
    
    Scalar implementation of strchrnul() in RISC-V assembly and changes to the
    corresponding manpage.
    
    Performance was benchmarked on a HiFive Unmatched (SiFive HF105-001) board
    using: https://github.com/clausecker/strperf
    
    os: FreeBSD
    arch: riscv
            │ strchrnul_baseline │          strchrnul_scalar           │
            │       sec/op       │   sec/op     vs base                │
    Short            680.2µ ± 5%   435.3µ ± 0%  -36.01% (p=0.000 n=20)
    Mid              314.7µ ± 3%   221.4µ ± 0%  -29.63% (p=0.000 n=20)
    Long             152.3µ ± 0%   138.5µ ± 0%   -9.08% (p=0.000 n=20)
    geomean          319.5µ        237.2µ       -25.75%
    
            │ strchrnul_baseline │          strchrnul_scalar          │
            │       MiB/s        │   MiB/s     vs base                │
    Short             183.8 ± 5%   287.2 ± 0%  +56.27% (p=0.000 n=20)
    Mid               397.3 ± 3%   564.6 ± 0%  +42.12% (p=0.000 n=20)
    Long              820.5 ± 0%   902.5 ± 0%   +9.99% (p=0.000 n=20)
    geomean           391.3        527.0       +34.68%
    
    MFC after:      1 month
    MFC to:         stable/15
    Approved by:    markj (mentor)
    Reviewed by:    fuz
    Sponsored by:   Google LLC (GSoC 2024)
    Differential Revision:  https://reviews.freebsd.org/D46047
---
 lib/libc/riscv/string/Makefile.inc |   1 +
 lib/libc/riscv/string/strchrnul.S  | 116 +++++++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+)

diff --git a/lib/libc/riscv/string/Makefile.inc b/lib/libc/riscv/string/Makefile.inc
index 4b97490a5494..719f22f6077f 100644
--- a/lib/libc/riscv/string/Makefile.inc
+++ b/lib/libc/riscv/string/Makefile.inc
@@ -4,4 +4,5 @@ MDSRCS+= \
 	memset.S \
 	strlen.S \
 	strnlen.S \
+	strchrnul.S \
 	strrchr.S
diff --git a/lib/libc/riscv/string/strchrnul.S b/lib/libc/riscv/string/strchrnul.S
new file mode 100644
index 000000000000..8abba71c4199
--- /dev/null
+++ b/lib/libc/riscv/string/strchrnul.S
@@ -0,0 +1,116 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+        .weak   strchrnul
+        .set    strchrnul, __strchrnul
+
+/*
+ * In:  a0 = const char *str, a1 = int c (only the low 8 bits are used).
+ * Out: a0 = address of the first byte equal to c or to \0, whichever is first.
+ */
+ENTRY(__strchrnul)
+	/*
+	 * a0 - const char *ptr, rounded down to an 8-byte boundary
+	 * a1 - (char)c replicated into all 8 bytes
+	 * a2 - loaded word, later the merged hit mask for \0 and c
+	 * a3 - scratch for the second has_zero test; t0/t1 = 0x01/0x80 per byte
+	 */
+
+	/* keep only the low byte of c */
+	andi a1, a1, 0xFF
+
+	/* t0 = 0x0101010101010101 (0x01 in every byte) */
+	li t0, 0x01010101
+	slli t1, t0, 32
+	or t0, t0, t1
+
+	/* t1 = 0x8080808080808080 (0x80 in every byte) */
+	slli t1, t0, 7
+
+	/* a1 = c copied into each of the 8 bytes */
+	mul a1, a1, t0
+
+	/* t2 = str & 7, the offset of str inside its aligned 8-byte word */
+	andi t2, a0, 0b111
+
+	/* round a0 down to the start of that 8-byte word */
+	andi a0, a0, ~0b111
+
+	/* offset 0 means there are no leading bytes to mask: go to the loop */
+	beqz t2, .Lloop
+
+	ld a2, (a0)
+
+	/* t2 = t0 >> (64 - 8 * t2): 0x01 in each of the t2 lowest bytes */
+	slli t2, t2, 3
+	neg t2, t2
+	srl t2, t0, t2
+
+	/* a3 = word with the bytes before str forced non-zero (for the \0 test) */
+	or a3, a2, t2
+
+	xor a2, a2, a1	/* bytes equal to c become zero */
+	or a2, a2, t2	/* bytes before str must not match c either */
+
+	/* has_zero(x) = (x - t0) & ~x & t1, interleaved for a3 (\0) and a2 (c) */
+	not t3, a3
+	not t2, a2
+	sub a3, a3, t0
+	sub a2, a2, t0
+	and a3, a3, t3
+	and a2, a2, t2
+	and a3, a3, t1
+	and a2, a2, t1
+
+
+	/* merge both masks; a0 is advanced first and .Lfind_char steps it back */
+	or a2, a2, a3
+	addi a0, a0, 8
+	bnez a2, .Lfind_char
+
+
+.Lloop:
+	ld a2, (a0)
+
+	/* a3 is zero where a byte equals c; has_zero on a2 (\0) and a3 (c) */
+	xor a3, a2, a1
+
+	not t2, a2
+	not t3, a3
+	sub a2, a2, t0
+	sub a3, a3, t0
+	and a2, a2, t2
+	and a3, a3, t3
+	and a2, a2, t1
+	and a3, a3, t1
+
+	/* a2 is non-zero once the word holds \0 or c; otherwise load the next */
+	or a2, a2, a3
+	addi a0, a0, 8
+	beqz a2, .Lloop
+
+.Lfind_char:
+	addi a0, a0, -8	/* back to the word that produced the hit */
+
+	/* isolate lowest set bit: a2 = 0x80 << (8 * k), k = lowest hit byte */
+	neg t0, a2
+	and a2, a2, t0
+
+	li t0, 0x0001020304050607
+	srli a2, a2, 7
+
+	/* a2 is now 2^(8*k), so the multiply shifts the index array in t0
+	 * left by k bytes, which moves the byte holding k into the top byte */
+	mul	a2, a2, t0
+
+	/* a2 = k, the byte index of the first \0 or c within the word */
+	srli a2, a2, 56
+
+	add a0, a0, a2
+	ret
+END(__strchrnul)