git: 5a52f0704435 - main - libc: scalar strnlen() in RISC-V assembly

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Fri, 31 Oct 2025 12:48:38 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=5a52f0704435b089199201be0029e0d7c9ef2fce

commit 5a52f0704435b089199201be0029e0d7c9ef2fce
Author:     Strahinja Stanišić <strajabot@FreeBSD.org>
AuthorDate: 2024-08-04 15:12:00 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-10-31 12:47:59 +0000

    libc: scalar strnlen() in RISC-V assembly
    
    Optimized implementation of strnlen() in RISC-V assembly
    
    Performance was measured using strperf on a HiFive Unmatched (SiFive HF105-001) board.
    
    os: FreeBSD
    arch: riscv
            │ strnlen_baseline │           strnlen_scalar            │
            │      sec/op      │   sec/op     vs base                │
    Short          787.0µ ± 0%   430.9µ ± 1%  -45.24% (p=0.000 n=20)
    Mid            621.6µ ± 0%   195.1µ ± 1%  -68.61% (p=0.000 n=20)
    Long           569.4µ ± 1%   100.6µ ± 0%  -82.34% (p=0.000 n=20)
    geomean        653.1µ        203.7µ       -68.81%
    
            │ strnlen_baseline │            strnlen_scalar            │
            │      MiB/s       │    MiB/s     vs base                 │
    Short           158.8 ± 0%    290.1 ± 1%   +82.62% (p=0.000 n=20)
    Mid             201.1 ± 0%    640.6 ± 1%  +218.59% (p=0.000 n=20)
    Long            219.5 ± 1%   1242.9 ± 0%  +466.19% (p=0.000 n=20)
    geomean         191.4         613.5       +220.57%
    
    MFC after:      1 month
    MFC to:         stable/15
    Approved by:    mhorne, markj (mentor)
    Reviewed by:    fuz, Jari Sihvola <jsihv@gmx.com>
    Sponsored by:   Google LLC (GSoC 2024)
    Differential Revision:  https://reviews.freebsd.org/D46230
---
 lib/libc/riscv/string/Makefile.inc |   1 +
 lib/libc/riscv/string/strnlen.S    | 143 +++++++++++++++++++++++++++++++++++++
 2 files changed, 144 insertions(+)

diff --git a/lib/libc/riscv/string/Makefile.inc b/lib/libc/riscv/string/Makefile.inc
index ebea8d1d3412..4b97490a5494 100644
--- a/lib/libc/riscv/string/Makefile.inc
+++ b/lib/libc/riscv/string/Makefile.inc
@@ -3,4 +3,5 @@ MDSRCS+= \
 	memcpy.S \
 	memset.S \
 	strlen.S \
+	strnlen.S \
 	strrchr.S
diff --git a/lib/libc/riscv/string/strnlen.S b/lib/libc/riscv/string/strnlen.S
new file mode 100644
index 000000000000..c0fd959548ff
--- /dev/null
+++ b/lib/libc/riscv/string/strnlen.S
@@ -0,0 +1,143 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * size_t strnlen(const char *s, size_t maxlen)
+ * Returns min(strlen(s), maxlen) for any maxlen, including SIZE_MAX.
+ */
+ENTRY(strnlen)
+	/*
+	 * a0 - const char *s;
+	 * a1 - size_t maxlen;
+	 * a2 - uint64_t *ptr;
+	 * a3 - char iter[8];
+	 * a4 - uint64_t *end_align;
+	 * a5 - uint64_t *end_unroll;
+	 * a6, t0-t3 - scratch;
+	 */
+
+	beqz a1, .Lnot_found
+
+	/* ptr = s & ~0b111 */
+	/* t0 = 0x0101010101010101 */
+	/* t1 = 0x8080808080808080 */
+	/* end = s + maxlen, saturated to ~0 if the sum wraps */
+	/* end_align = (end + 7) & ~0b111, may wrap to 0 (== 2^64) */
+	/* mask_start = t0 >> ((-s.value) << 3) */
+	add a4, a0, a1
+	li t0, 0x01010101
+	sltu t3, a4, a0
+	slli t1, t0, 32
+	neg t3, t3
+	neg t2, a0
+	or a4, a4, t3
+	or t0, t0, t1
+	addi a4, a4, 7
+	slli t2, t2, 3
+	andi a4, a4, ~0b111
+	andi a2, a0, ~0b111
+	slli t1, t0, 7
+	srl t2, t0, t2
+
+	/* if pointer is aligned skip to loop */
+	beq a0, a2, .Lskip_start
+
+	/* iter = *ptr | mask_start, so bytes before s never read as NUL */
+	ld a3, (a2)
+	or a3, a3, t2
+
+	/* has_zero */
+	not t2, a3
+	sub a3, a3, t0
+	and t2, t2, t1
+	and a3, a3, t2
+
+	addi a2, a2, 8
+	bnez a3, .Lfind_zero
+
+.Lskip_start:
+	/* end_unroll */
+	sub t2, a4, a2
+	andi t2, t2, ~0b1111
+	add a5, a2, t2
+
+	/* while (ptr != end_unroll) */
+	beq a2, a5, .Lskip_loop
+.Lloop:
+	ld a3, (a2)
+	ld a6, 8(a2)
+
+	/* has_zero */
+	not t2, a3
+	not t3, a6
+	sub a3, a3, t0
+	sub a6, a6, t0
+	and t2, t2, t1
+	and t3, t3, t1
+	and a3, a3, t2
+	and a6, a6, t3
+
+	addi a2, a2, 8
+	bnez a3, .Lfind_zero
+
+	mv a3, a6
+	addi a2, a2, 8
+	bnez a3, .Lfind_zero
+
+	bne a2, a5, .Lloop
+
+.Lskip_loop:
+	/* at most one word left before end_align */
+	beq a2, a4, .Lnot_found
+
+	ld a3, (a2)
+	/* has_zero */
+	not t2, a3
+	sub a3, a3, t0
+	and t2, t2, t1
+	and a3, a3, t2
+
+	addi a2, a2, 8
+	beqz a3, .Lnot_found
+
+.Lfind_zero:
+	/* move ptr back */
+	addi a2, a2, -8
+
+	/* isolate lowest set bit */
+	neg t0, a3
+	and a3, a3, t0
+
+	li t0, 0x0001020304050607
+	srli a3, a3, 7
+
+	/* lowest set bit is 2^(8*k)
+	 * multiplying by it shifts the idx array in t0 by k bytes to the left */
+	mul a3, a3, t0
+
+	/* highest byte contains idx of first zero */
+	srli a3, a3, 56
+
+	/* zero_idx */
+	sub a2, a2, a0
+	add a2, a2, a3
+
+	/* unsigned min(zero_idx, maxlen): maxlen may exceed SSIZE_MAX */
+	sltu t1, a2, a1
+	sub a2, a2, a1
+	neg t1, t1
+	and a2, a2, t1
+	add a0, a1, a2
+
+	ret
+
+.Lnot_found:
+	mv a0, a1
+	ret
+
+END(strnlen)