git: 40a958d5850d - main - libc: scalar memset() in RISC-V assembly

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Fri, 31 Oct 2025 12:48:31 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=40a958d5850ddda6d863558c8b31572f700d53ca

commit 40a958d5850ddda6d863558c8b31572f700d53ca
Author:     Strahinja Stanišić <strajabot@FreeBSD.org>
AuthorDate: 2024-06-21 15:43:45 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-10-31 12:47:58 +0000

    libc: scalar memset() in RISC-V assembly
    
    Adds a scalar implementation of memset() for RISC-V
    and updates the relevant manpage.
    
    os: FreeBSD
    arch: riscv
            │ ./results/memset/memset_baseline │   ./results/memset/memset_scalar    │
            │              sec/op              │   sec/op     vs base                │
    40                             527.5µ ± 1%   479.4µ ± 1%   -9.12% (p=0.000 n=20)
    168                            254.5µ ± 1%   216.7µ ± 1%  -14.86% (p=0.000 n=20)
    2k                             169.5µ ± 1%   128.4µ ± 0%  -24.24% (p=0.000 n=20)
    256k                           161.2µ ± 1%   118.6µ ± 1%  -26.42% (p=0.000 n=20)
    16m                            56.58m ± 0%   53.91m ± 0%   -4.72% (p=0.000 n=20)
    geomean                        730.2µ        611.2µ       -16.29%
    
            │ ./results/memset/memset_baseline │    ./results/memset/memset_scalar     │
            │               B/s                │      B/s       vs base                │
    40                            452.0Mi ± 1%    497.3Mi ± 1%  +10.04% (p=0.000 n=20)
    168                           936.9Mi ± 1%   1100.4Mi ± 1%  +17.45% (p=0.000 n=20)
    2k                            1.373Gi ± 1%    1.813Gi ± 0%  +32.00% (p=0.000 n=20)
    256k                          1.444Gi ± 1%    1.962Gi ± 1%  +35.91% (p=0.000 n=20)
    16m                           269.7Mi ± 0%    283.1Mi ± 0%   +4.96% (p=0.000 n=20)
    geomean                       750.1Mi         896.1Mi       +19.47%
    
    MFC after:      1 month
    MFC to:         stable/15
    Approved by:    mhorne, markj (mentor)
    Reviewed by:    fuz
    Sponsored by:   Google LLC (GSoC 2024)
    Differential Revision:  https://reviews.freebsd.org/D45730
---
 lib/libc/riscv/string/Makefile.inc |  1 +
 lib/libc/riscv/string/memset.S     | 95 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/lib/libc/riscv/string/Makefile.inc b/lib/libc/riscv/string/Makefile.inc
index cdbc19d286fc..44aeb65bf1f7 100644
--- a/lib/libc/riscv/string/Makefile.inc
+++ b/lib/libc/riscv/string/Makefile.inc
@@ -1,3 +1,4 @@
 MDSRCS+= \
 	memchr.S \
+	memset.S \
 	strrchr.S
diff --git a/lib/libc/riscv/string/memset.S b/lib/libc/riscv/string/memset.S
new file mode 100644
index 000000000000..ca435dfdd5c1
--- /dev/null
+++ b/lib/libc/riscv/string/memset.S
@@ -0,0 +1,117 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * void *memset(void *dest, int c, size_t len)
+ *
+ * register a0 - void *dest (never modified, so it is also the return value)
+ * register a1 - int c
+ * register a2 - size_t len
+ *
+ * t0 is the store cursor; t1-t4 are scratch.
+ *
+ * The computed jumps below land inside runs of sb/sd instructions and
+ * assume each one is 4 bytes long.  Keep t0 as their base register: the
+ * compressed store encodings cannot address it, so the assembler will
+ * not shrink them.
+ */
+ENTRY(memset)
+	andi a1, a1, 0xFF		/* only the low byte of c is stored */
+
+	/* Fewer than 8 bytes: go straight to the byte-wise tail. */
+	sltiu t1, a2, 8
+	mv t0, a0
+	bnez t1, .Lend
+
+	/* Replicate the fill byte into all 8 bytes of a1. */
+	li t1, 0x0101010101010101
+	mul a1, a1, t1
+
+	andi t1, a0, 0b111		/* t1 = dest % 8 */
+	andi t0, a0, ~0b111		/* t0 = dest rounded down to 8 */
+
+	beqz t1, .Lloop_store_64
+
+	/*
+	 * Unaligned head: enter the byte stores at index t1 so that bytes
+	 * t1..7 of the doubleword at t0 are written (8 - t1 bytes).
+	 * lla keeps the local label PC-relative even when built as PIC.
+	 */
+	lla t2, .Lduff_start
+	slli t3, t1, 2			/* 4 bytes per instruction */
+	add t2, t2, t3
+	jr -4(t2)
+.Lduff_start:
+	sb a1, 1(t0)
+	sb a1, 2(t0)
+	sb a1, 3(t0)
+	sb a1, 4(t0)
+	sb a1, 5(t0)
+	sb a1, 6(t0)
+	sb a1, 7(t0)
+
+	/* a2 = a2 - (8 - t1) <=> a2 = a2 + (t1 - 8) */
+	addi t1, t1, -8
+	add a2, a2, t1
+	addi t0, t0, 8
+
+	/* Main loop: t0 is 8-byte aligned, 64 bytes per iteration. */
+.Lloop_store_64:
+	sltiu t1, a2, 64		/* len is unsigned, as at entry */
+	bnez t1, .Lstore_rest
+	sd a1, 0(t0)
+	sd a1, 8(t0)
+	sd a1, 16(t0)
+	sd a1, 24(t0)
+	sd a1, 32(t0)
+	sd a1, 40(t0)
+	sd a1, 48(t0)
+	sd a1, 56(t0)
+	addi a2, a2, -64
+	addi t0, t0, 64
+	j .Lloop_store_64
+
+	/*
+	 * Fewer than 64 bytes left.  t3 = a2 rounded down to a multiple of
+	 * 8, at most 56, so the jump enters at most 7 stores before
+	 * .Lduff_rest.
+	 */
+.Lstore_rest:
+	lla t2, .Lduff_rest
+	andi t3, a2, ~0b111
+	srli t4, t3, 1			/* (t3 / 8) stores * 4 bytes each */
+	sub t2, t2, t4
+	jr t2
+	sd a1, 48(t0)
+	sd a1, 40(t0)
+	sd a1, 32(t0)
+	sd a1, 24(t0)
+	sd a1, 16(t0)
+	sd a1, 8(t0)
+	sd a1, 0(t0)
+.Lduff_rest:
+	add t0, t0, t3
+	sub a2, a2, t3
+
+	/* Tail: a2 < 8 bytes remain; enter so that exactly a2 are stored. */
+.Lend:
+	slli a2, a2, 2			/* 4 bytes per instruction */
+	lla t2, .Lduff_end
+	sub t2, t2, a2
+	jr t2
+	sb a1, 6(t0)
+	sb a1, 5(t0)
+	sb a1, 4(t0)
+	sb a1, 3(t0)
+	sb a1, 2(t0)
+	sb a1, 1(t0)
+	sb a1, (t0)
+.Lduff_end:
+	ret
+END(memset)
+