git: df21a004be23 - main - libc: scalar strrchr() in RISC-V assembly

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Fri, 31 Oct 2025 12:48:27 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=df21a004be237a1dccd03c7b47254625eea62fa9

commit df21a004be237a1dccd03c7b47254625eea62fa9
Author:     Strahinja Stanišić <strajabot@FreeBSD.org>
AuthorDate: 2024-10-24 16:18:07 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-10-31 12:47:57 +0000

    libc: scalar strrchr() in RISC-V assembly
    
    Implements strrchr in RISC-V assembly, leading to the following
    improvements (performance measured on SiFive HF105-001)
    
    os: FreeBSD
    arch: riscv
            │ strrchr_baseline │             strrchr_scalar             │
            │      sec/op      │   sec/op     vs base                   │
    Short          837.2µ ± 1%   574.6µ ± 1%  -31.37% (p=0.000 n=20+21)
    Mid            639.7µ ± 0%   269.7µ ± 0%  -57.84% (p=0.000 n=20+21)
    Long           589.1µ ± 0%   176.7µ ± 0%  -70.01% (p=0.000 n=20+21)
    geomean        680.8µ        301.4µ       -55.73%
    
            │ strrchr_baseline │             strrchr_scalar             │
            │      MiB/s       │   MiB/s     vs base                    │
    Short           149.3 ± 1%   217.6 ± 1%   +45.71% (p=0.000 n=20+21)
    Mid             195.4 ± 0%   463.6 ± 0%  +137.22% (p=0.000 n=20+21)
    Long            212.2 ± 0%   707.4 ± 0%  +233.40% (p=0.000 n=20+21)
    geomean         183.6        414.7       +125.88%
    
    MFC after:      1 month
    MFC to:         stable/15
    Approved by:    mhorne, markj (mentor)
    Sponsored by:   Google LLC (GSoC 2024)
    Differential Revision:  https://reviews.freebsd.org/D47275
---
 lib/libc/riscv/string/Makefile.inc |   2 +
 lib/libc/riscv/string/strrchr.S    | 124 +++++++++++++++++++++++++++++++++++++
 2 files changed, 126 insertions(+)

diff --git a/lib/libc/riscv/string/Makefile.inc b/lib/libc/riscv/string/Makefile.inc
new file mode 100644
index 000000000000..a9cf8bf52481
--- /dev/null
+++ b/lib/libc/riscv/string/Makefile.inc
@@ -0,0 +1,2 @@
+MDSRCS+= \
+	strrchr.S	# scalar RISC-V assembly strrchr(), added by this commit
diff --git a/lib/libc/riscv/string/strrchr.S b/lib/libc/riscv/string/strrchr.S
new file mode 100644
index 000000000000..51f34ca21fac
--- /dev/null
+++ b/lib/libc/riscv/string/strrchr.S
@@ -0,0 +1,124 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * a0 - const char *s
+ * a1 - int c
+ */
+ENTRY(strrchr)
+	/*
+	 * char *strrchr(const char *s, int c)
+	 *
+	 * Returns a pointer to the last byte equal to (char)c in s, the
+	 * terminating NUL included, or NULL if there is none.  The string
+	 * is scanned one aligned 64-bit word at a time.
+	 *
+	 * a0 - const char *ptr_align
+	 * a1 - temporary
+	 * a2 - temporary
+	 * a3 - temporary
+	 * a4 - temporary
+	 * a5 - const char[8] cccccccc
+	 * a6 - const uint64_t *save_align
+	 * a7 - uint64_t save_iter
+	 * t0 - const uint64_t REP8_0X01
+	 * t1 - const uint64_t REP8_0X80
+	 */
+
+	/*
+	 * save_align = 0
+	 * save_iter = 0xFFFFFFFFFFFFFF00
+	 * REP8_0X01 = 0x0101010101010101
+	 * REP8_0X80 = REP8_0X01 << 7
+	 * cccccccc = (char)c * REP8_0X01
+	 * pad = ~(-1 << ((str % 8) * 8))
+	 * ptr_align = str - str % 8
+	 */
+	li t0, 0x01010101
+	li a6, 0
+	slli a2, a0, 3
+	slli t1, t0, 32
+	li a7, 0xFFFFFFFFFFFFFF00
+	or t0, t0, t1
+	andi a1, a1, 0xFF
+	li a3, -1
+	andi a0, a0, ~0b111
+	mul a5, a1, t0
+	sll a3, a3, a2
+	slli t1, t0, 7
+	not a3, a3			/* a3 -> pad			*/
+
+	/*
+	 * First word: force the bytes in front of str to 0xFF in both data
+	 * and iter.  Masking hz/hc afterwards is not enough: a NUL (or c)
+	 * in front of str borrows into the next byte during the subtraction
+	 * below, so a string starting with 0x01 (or c ^ 1) would be flagged.
+	 */
+	ld a1, 0(a0)			/* a1 -> data = *ptr_align	*/
+	or a1, a1, a3			/* data = data | pad		*/
+	xor a2, a1, a5			/* a2 -> iter = data ^ cccccccc	*/
+	or a2, a2, a3			/* iter = iter | pad		*/
+	j .Lcheck
+
+.Lloop:					/* do {				*/
+	ld a1, 0(a0)			/* a1 -> data = *ptr_align	*/
+	xor a2, a1, a5			/* a2 -> iter = data ^ cccccccc	*/
+.Lcheck:
+	not a3, a1			/* a3 -> nhz = ~data		*/
+	sub a1, a1, t0			/* a1 -> hz = data - REP8_0X01	*/
+	not a4, a2			/* a4 -> nhc = ~iter		*/
+	and a1, a1, a3			/* hz = hz & nhz		*/
+	sub a3, a2, t0			/* a3 -> hc = iter - REP8_0X01	*/
+	and a1, a1, t1			/* hz = hz & REP8_0X80		*/
+	and a3, a3, a4			/* hc = hc & nhc		*/
+	addi a4, a1, -1			/* a4 -> mask_end = hz - 1	*/
+	and a3, a3, t1			/* hc = hc & REP8_0X80		*/
+	xor a4, a4, a1			/* mask_end = mask_end ^ hz	*/
+	addi a0, a0, 8			/* ptr_align = ptr_align + 8	*/
+	and a3, a3, a4			/* hc = hc & mask_end		*/
+	not a4, a4			/* mask_end = ~mask_end		*/
+
+	beqz a3, .Lskip_save		/* if(!hc) goto skip_save	*/
+	or a2, a2, a4			/* iter = iter | mask_end	*/
+	addi a6, a0, -8			/* save_align = ptr_align - 8	*/
+	mv a7, a2			/* save_iter = iter		*/
+
+.Lskip_save:
+	beqz a1, .Lloop			/* } while(!hz)			*/
+
+.Lfind_char:
+	/*
+	 * Locate the most significant zero byte of save_iter: it marks the
+	 * last match.  If bytes 7..1 are all non-zero, match_off ends up 0.
+	 *
+	 * a7 -> iter = save_iter
+	 * a2 -> mask_iter = 0xFF00000000000000
+	 * a3 -> match_off = 7
+	 */
+	li a2, 0xFF
+	li a3, 7
+	slli a2, a2, 56
+
+	and a0, a7, a2			/* a0 -> iter & mask_iter	*/
+	srli a2, a2, 8			/* mask_iter = mask_iter >> 8	*/
+	beqz a0, .Lret			/* zero byte: match_off found	*/
+
+	.rept 6				/* bytes 6..1: same test	*/
+	addi a3, a3, -1
+	and a0, a7, a2
+	srli a2, a2, 8
+	beqz a0, .Lret
+	.endr
+
+	addi a3, a3, -1			/* fell through: match_off = 0	*/
+
+.Lret:
+	/* return save_align + match_offset */
+	add a0, a6, a3
+	ret
+END(strrchr)