git: 25fdd86a4c92 - main - libc: scalar memcpy() in RISC-V assembly

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Fri, 31 Oct 2025 12:48:36 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=25fdd86a4c92b5bdab82db289f3bcd57756778e7

commit 25fdd86a4c92b5bdab82db289f3bcd57756778e7
Author:     Strahinja Stanišić <strajabot@FreeBSD.org>
AuthorDate: 2024-07-24 23:33:30 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-10-31 12:47:59 +0000

    libc: scalar memcpy() in RISC-V assembly
    
    Optimized assembly implementation of memcpy() for the RISC-V architecture.
    The implementation has two paths, chosen by the relative alignment of the
    two buffers (see the sketch after this list):
    
     - An aligned path, taken when (dst - src) % 8 == 0, which runs faster
     - An unaligned path, taken when (dst - src) % 8 != 0, which runs slower
    
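    Roughly, in C (an illustrative sketch with invented names, not part of
    the commit; the real unaligned path merges aligned source dwords with
    shifts instead of falling back to a byte loop):
    
        #include <stddef.h>
        #include <stdint.h>
        #include <string.h>
        
        static void *
        memcpy_sketch(void *dst, const void *src, size_t len)
        {
            unsigned char *d = dst;
            const unsigned char *s = src;
        
            if (((uintptr_t)d - (uintptr_t)s) % 8 == 0) {
                /* aligned path: a short byte copy aligns d (and hence s),
                   then whole dwords are moved at a time */
                while (len > 0 && (uintptr_t)d % 8 != 0) {
                    *d++ = *s++;
                    len--;
                }
                while (len >= 8) {
                    uint64_t w;
                    memcpy(&w, s, 8);    /* models ld */
                    memcpy(d, &w, 8);    /* models sd */
                    s += 8;
                    d += 8;
                    len -= 8;
                }
            }
            /* unaligned path (simplified here to a byte loop) and tail */
            while (len > 0) {
                *d++ = *s++;
                len--;
            }
            return (dst);
        }
    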
    os: FreeBSD
    arch: riscv
               │ memcpy_baseline │            memcpy_scalar            │
               │     sec/op      │   sec/op     vs base                │
    64Align8         851.6µ ± 1%   488.9µ ± 1%  -42.59% (p=0.000 n=12)
    4kAlign8         681.5µ ± 1%   255.1µ ± 2%  -62.57% (p=0.000 n=12)
    256kAlign8       273.0µ ± 2%   230.7µ ± 2%  -15.50% (p=0.000 n=12)
    16mAlign8        98.07m ± 0%   95.29m ± 0%   -2.84% (p=0.000 n=12)
    64UAlign         887.5µ ± 1%   531.6µ ± 1%  -40.10% (p=0.000 n=12)
    4kUAlign         725.6µ ± 1%   262.2µ ± 1%  -63.87% (p=0.000 n=12)
    256kUAlign       844.1µ ± 2%   322.8µ ± 0%  -61.76% (p=0.000 n=12)
    16mUAlign        134.9m ± 0%   101.2m ± 0%  -24.97% (p=0.000 n=20)
    geomean          2.410m        1.371m       -43.12%
    
               │ memcpy_baseline │            memcpy_scalar             │
               │      MiB/s      │    MiB/s     vs base                 │
    64Align8          293.6 ± 1%    511.3 ± 1%   +74.18% (p=0.000 n=12)
    4kAlign8          366.8 ± 1%    980.0 ± 2%  +167.15% (p=0.000 n=12)
    256kAlign8        915.8 ± 2%   1083.7 ± 2%   +18.34% (p=0.000 n=12)
    16mAlign8         163.1 ± 0%    167.9 ± 0%    +2.92% (p=0.000 n=12)
    64UAlign          281.7 ± 1%    470.3 ± 1%   +66.94% (p=0.000 n=12)
    4kUAlign          344.5 ± 1%    953.6 ± 1%  +176.77% (p=0.000 n=12)
    256kUAlign        296.2 ± 2%    774.5 ± 0%  +161.49% (p=0.000 n=12)
    16mUAlign         118.6 ± 0%    158.1 ± 0%   +33.28% (p=0.000 n=20)
    geomean           293.4         515.8        +75.81%
    
    MFC after:      1 month
    MFC to:         stable/15
    Approved by:    mhorne, markj (mentor)
    Reviewed by:    fuz
    Sponsored by:   Google LLC (GSoC 2024)
    Differential Revision:  https://reviews.freebsd.org/D46139
---
 lib/libc/riscv/string/Makefile.inc |   1 +
 lib/libc/riscv/string/memcpy.S     | 217 +++++++++++++++++++++++++++++++++++++
 2 files changed, 218 insertions(+)

diff --git a/lib/libc/riscv/string/Makefile.inc b/lib/libc/riscv/string/Makefile.inc
index 5853ea114277..ebea8d1d3412 100644
--- a/lib/libc/riscv/string/Makefile.inc
+++ b/lib/libc/riscv/string/Makefile.inc
@@ -1,5 +1,6 @@
 MDSRCS+= \
 	memchr.S \
+	memcpy.S \
 	memset.S \
 	strlen.S \
 	strrchr.S
diff --git a/lib/libc/riscv/string/memcpy.S b/lib/libc/riscv/string/memcpy.S
new file mode 100644
index 000000000000..7536514df777
--- /dev/null
+++ b/lib/libc/riscv/string/memcpy.S
@@ -0,0 +1,217 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * a0 - void* dst
+ * a1 - const void* src
+ * a2 - size_t len
+ */
+ENTRY(memcpy)
+	beqz a2, .Lreturn
+
+	/* diff = (dst - src) & 0b111 */
+	sub t0, a0, a1
+	andi t0, t0, 0b111
+
+	sltiu t1, a2, 8
+
+	/* we never change a0, because memcpy returns the original dst */
+	mv a3, a0
+
+	/* len < 8 */
+	bnez t1, .Lend
+
+	/* t1 = (-dst) & 0b111 */
+	neg t1, a0
+	andi t1, t1, 0b111
+
+	sub a2, a2, t1
+
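+	/*
+	 * copy the t1 head bytes that bring dst up to 8-byte alignment:
+	 * each lb/sb pair below is 8 bytes of code, so jumping to
+	 * .Lduff_start - t1 * 8 executes exactly the last t1 pairs,
+	 * copying byte offsets t1 - 1 down to 0
+	 */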
+	la t2, .Lduff_start
+	slli t3, t1, 3
+	sub t2, t2, t3
+	jr t2
+	lb t3, 6(a1)
+	sb t3, 6(a3)
+	lb t3, 5(a1)
+	sb t3, 5(a3)
+	lb t3, 4(a1)
+	sb t3, 4(a3)
+	lb t3, 3(a1)
+	sb t3, 3(a3)
+	lb t3, 2(a1)
+	sb t3, 2(a3)
+	lb t3, 1(a1)
+	sb t3, 1(a3)
+	lb t3, 0(a1)
+	sb t3, 0(a3)
+.Lduff_start:
+
+	add a1, a1, t1
+	add a3, a3, t1
+
+	beqz a2, .Lreturn
+
+	beqz t0, .Lmemcpy8
+
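+	/*
+	 * unaligned path: dst is now 8-byte aligned, src is not;
+	 * read aligned dwords from src and merge each adjacent pair
+	 * with shifts so that every store to dst is an aligned dword
+	 */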
+	/*
+	 * a4 - size_t right_shift
+	 * a5 - size_t left_shift
+	 * a6 - size_t whole (number of dword stores)
+	 */
+
+	/* right_shift = (src & 0b111) * 8 */
+	andi a4, a1, 0b111
+	slli a4, a4, 3
+
+	/*
+	 * left_shift = 64 - right_shift
+	 * (neg is enough: sll/srl use only the low six bits of the shift amount)
+	 */
+	neg a5, a4
+
+	/* whole = len / 8 */
+	srli a6, a2, 3
+
+	/* len = len % 8 */
+	andi a2, a2, 0b111
+
+	/* t0 - uint64_t* ptr */
+
+	/* ptr = src & ~0b111 */
+	andi t0, a1, ~0b111
+
+	/* src += whole * 8 */
+	slli t1, a6, 3
+	add a1, a1, t1
+
+	/*
+	 * t1 - uint64_t low
+	 * t2 - uint64_t high
+	 */
+
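+	/*
+	 * the first aligned load may read up to 7 bytes before src, but it
+	 * stays within the dword (and page) holding the first source byte
+	 */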
+	/* low = *ptr++ */
+	ld t1, (t0)
+	addi t0, t0, 8
+
+	/* low >>= right_shift */
+	srl t1, t1, a4
+
+	beqz a6, .Llmain_skip
+.Llmain:
+	/* high = *ptr++ */
+	ld t2, (t0)
+	addi t0, t0, 8
+
+	/* whole-- */
+	addi a6, a6, -1
+
+	/* temp = (high << left_shift) | low */
+	sll t3, t2, a5
+	or t3, t3, t1
+
+	/* low = high >> right_shift */
+	srl t1, t2, a4
+
+	/* *dst++ = temp */
+	sd t3, (a3)
+	addi a3, a3, 8
+
+	bnez a6, .Llmain
+
+.Llmain_skip:
+
+.Lend:
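+	/* copy the last len (0-7) bytes via the byte-copy table below */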
+	la t1, .Lduff_end
+	slli t2, a2, 3
+	sub t1, t1, t2
+	jr t1
+	lb t2, 6(a1)
+	sb t2, 6(a3)
+	lb t2, 5(a1)
+	sb t2, 5(a3)
+	lb t2, 4(a1)
+	sb t2, 4(a3)
+	lb t2, 3(a1)
+	sb t2, 3(a3)
+	lb t2, 2(a1)
+	sb t2, 2(a3)
+	lb t2, 1(a1)
+	sb t2, 1(a3)
+	lb t2, 0(a1)
+	sb t2, 0(a3)
+.Lduff_end:
+
+.Lreturn:
+	ret
+
+/*
+ * executed when dst - src is a multiple of 8
+ * a0 - void* dst
+ * a1 - const void* src
+ * a2 - size_t len
+ */
+.Lmemcpy8:
+
+	beqz a2, .Lreturn
+
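+	/* for large copies (len >= 128), move 64 bytes per loop iteration */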
+	slti t0, a2, 128
+	bnez t0, .Llmain8_64_skip
+
+	/* a4 - uint64_t* end_unroll */
+
+	/* end_unroll = dst + len / 64 * 64 */
+	andi t0, a2, ~0b111111
+	add a4, a3, t0
+
+	/* len = len % 64 */
+	andi a2, a2, 0b111111
+
+.Llmain8_64:
+	ld t0, 0(a1)
+	ld t1, 8(a1)
+	ld t2, 16(a1)
+	ld t3, 24(a1)
+	sd t0, 0(a3)
+	sd t1, 8(a3)
+	sd t2, 16(a3)
+	sd t3, 24(a3)
+	ld t0, 32(a1)
+	ld t1, 40(a1)
+	ld t2, 48(a1)
+	ld t3, 56(a1)
+	sd t0, 32(a3)
+	sd t1, 40(a3)
+	sd t2, 48(a3)
+	sd t3, 56(a3)
+	addi a3, a3, 64
+	addi a1, a1, 64
+	bne a3, a4, .Llmain8_64
+.Llmain8_64_skip:
+
+	beqz a2, .Lreturn
+
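+	/* copy the remaining whole dwords one at a time */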
+	/* a4 - uint64_t* end_align */
+
+	/* end_align = (dst + len) & ~0b111 */
+	add a4, a3, a2
+	andi a4, a4, ~0b111
+
+	/* len = len % 8 */
+	andi a2, a2, 0b111
+
+	beq a3, a4, .Llmain8_skip
+.Llmain8:
+	ld t0, (a1)
+	sd t0, (a3)
+	addi a3, a3, 8
+	addi a1, a1, 8
+	bne a3, a4, .Llmain8
+.Llmain8_skip:
+
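+	/*
+	 * copy the final len (0-7) bytes by reusing the byte-copy table
+	 * above, then fall through .Lduff_end to the ret at .Lreturn
+	 */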
+	la t1, .Lduff_end
+	slli t2, a2, 3
+	sub t1, t1, t2
+	jr t1
+END(memcpy)