git: 74d6cfad54d6 - main - lib/libc/amd64/string: add strlcpy scalar, baseline implementation

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Mon, 25 Dec 2023 14:25:54 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=74d6cfad54d676299ee5e4695139461876dfd757

commit 74d6cfad54d676299ee5e4695139461876dfd757
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2023-11-12 22:47:06 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2023-12-25 13:56:05 +0000

    lib/libc/amd64/string: add strlcpy scalar, baseline implementation
    
    Somewhat similar to stpncpy, but different in that we need to compute
    the full source length even if the buffer is shorter than the source.
    
    strlcat is implemented as a simple wrapper around strlcpy.  The scalar
    implementation of strlcpy just calls into strlen() and memcpy() to do
    the job.
    
    Perf-wise we're very close to stpncpy.  The code is slightly slower as
    it needs to carry on with finding the source string length even if the
    buffer ends before the string.
    
    Sponsored by:   The FreeBSD Foundation
    Tested by:      developers@, exp-run
    Approved by:    mjg
    MFC after:      1 month
    MFC to:         stable/14
    PR:             275785
    Differential Revision: https://reviews.freebsd.org/D42863
---
 lib/libc/amd64/string/Makefile.inc |   1 +
 lib/libc/amd64/string/strlcpy.S    | 281 +++++++++++++++++++++++++++++++++++++
 2 files changed, 282 insertions(+)

diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index d982061e080b..03bca498e116 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -13,6 +13,7 @@ MDSRCS+= \
 	strcmp.S \
 	strcpy.c \
 	strcspn.S \
+	strlcpy.S \
 	strlen.S \
 	strncmp.S \
 	strncpy.c \
diff --git a/lib/libc/amd64/string/strlcpy.S b/lib/libc/amd64/string/strlcpy.S
new file mode 100644
index 000000000000..2b32c6c78047
--- /dev/null
+++ b/lib/libc/amd64/string/strlcpy.S
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT	.p2align 4, 0x90	/* align to 2^4 = 16 bytes, filling with 0x90 (NOP) bytes */
+
+	.weak strlcpy			/* strlcpy is exported as a weak symbol ... */
+	.set strlcpy, __strlcpy		/* ... with the same value as __strlcpy */
+ARCHFUNCS(__strlcpy)			/* lists the variants of __strlcpy defined below; NOTE(review): macro semantics come from amd64_archlevel.h -- confirm there */
+	ARCHFUNC(__strlcpy, scalar)
+	ARCHFUNC(__strlcpy, baseline)
+ENDARCHFUNCS(__strlcpy)
+
+ARCHENTRY(__strlcpy, scalar)	/* strlcpy(%rdi = dst, %rsi = src, %rdx = dstsize) built on strlen() and memcpy(); returns strlen(src) in %rax */
+	push	%rbp		# establish stack frame
+	mov	%rsp, %rbp
+	push	%rsi		# keep src at -8(%rbp)
+	push	%rbx		# preserve callee-saved RBX, used below for the return value
+	push	%rdi		# keep dst across the strlen() call
+	push	%rdx		# keep dstsize across the strlen() call
+	mov	%rsi, %rdi
+	call	CNAME(strlen)	# strlen(src)
+	pop	%rdx		# dstsize
+	pop	%rdi		# dst
+	mov	-8(%rbp), %rsi	# reload src from its stack slot
+	mov	%rax, %rbx	# remember string length for return value
+	sub	$1, %rdx	# do not copy into the final byte of the buffer
+	jc	0f		# skip copying altogether if buffer was empty
+	cmp	%rax, %rdx	# is the buffer longer than the input?
+	cmova	%rax, %rdx	# if yes, copy only strlen(src) bytes
+	movb	$0, (%rdi, %rdx, 1) # NUL-terminate output buffer
+	call	CNAME(memcpy)	# copy string to output
+0:	mov	%rbx, %rax	# restore return value
+	pop	%rbx
+	leave			# also discards the saved src slot
+	ret
+ARCHEND(__strlcpy, scalar)
+
+ARCHENTRY(__strlcpy, baseline)	/* strlcpy(%rdi = dst, %rsi = src, %rdx = dstsize) using 16-byte XMM chunks; returns strlen(src) in %rax */
+	sub		$1, %rdx		# do not count NUL byte in buffer length
+	jb		.L0			# go to special code path if len was 0
+
+	mov		%esi, %ecx		# copy of src; reduced to its misalignment below
+	pxor		%xmm1, %xmm1		# XMM1 = 0 to compare against
+	mov		%rsi, %r9		# stash a copy of the source pointer for later
+	and		$~0xf, %rsi		# round RSI down to a 16-byte boundary
+	pcmpeqb		(%rsi), %xmm1		# NUL found in head?
+	mov		$-1, %r8d
+	and		$0xf, %ecx		# ECX = misalignment of src (0--15)
+	shl		%cl, %r8d		# mask of bytes in the string
+	pmovmskb	%xmm1, %eax
+	and		%r8d, %eax		# discard matches in front of the string
+	jnz		.Lhead_nul		# string ends within the head
+
+	movdqa		16(%rsi), %xmm3		# load second string chunk
+	movdqu		(%r9), %xmm2		# load unaligned string head
+	mov		$32, %r8d
+	sub		%ecx, %r8d		# head length + length of second chunk
+	pxor		%xmm1, %xmm1
+	pcmpeqb		%xmm3, %xmm1		# NUL found in second chunk?
+
+	sub		%r8, %rdx		# enough space left for the second chunk?
+	jbe		.Lhead_buf_end		# no: buffer ends by the end of the second chunk
+
+	/* process second chunk */
+	pmovmskb	%xmm1, %eax
+	test		%eax, %eax
+	jnz		.Lsecond_nul		# string ends within the second chunk
+
+	/* string didn't end in second chunk and neither did buffer -- not a runt! */
+	movdqa		32(%rsi), %xmm0		# load next string chunk
+	pxor		%xmm1, %xmm1
+	movdqu		%xmm2, (%rdi)		# deposit head into buffer
+	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
+	movdqu		%xmm3, 16(%rdi)		# deposit second chunk
+	sub		%rsi, %rdi		# express RDI as distance from RSI
+	add		$32, %rsi		# advance RSI past first two chunks
+	sub		$16, %rdx		# enough left for another round?
+	jbe		1f			# no: 1--16 bytes of buffer left
+
+	/* main loop unrolled twice */
+	ALIGN_TEXT
+0:	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
+	pmovmskb	%xmm1, %eax
+	test		%eax, %eax
+	jnz		3f			# yes: copy the tail and return
+
+	movdqu		%xmm0, (%rsi, %rdi)	# store chunk; RSI + RDI is the matching dst address
+	movdqa		16(%rsi), %xmm0		# load next string chunk
+	pxor		%xmm1, %xmm1
+	cmp		$16, %rdx		# more than a full chunk left?
+	jbe		2f			# no: finish with the bounded tail copy
+
+	add		$32, %rsi		# advance pointers to next chunk
+	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
+	pmovmskb	%xmm1, %eax
+	test		%eax, %eax
+	jnz		4f			# yes: fix up RSI and RDX, then copy the tail
+
+	movdqu		%xmm0, -16(%rsi, %rdi)	# store second chunk of this iteration
+	movdqa		(%rsi), %xmm0		# load next string chunk
+	pxor		%xmm1, %xmm1
+	sub		$32, %rdx		# account for both chunks of this iteration
+	ja		0b			# loop while buffer space remains beyond XMM0
+
+1:	sub		$16, %rsi		# undo second advancement
+	add		$16, %edx		# restore number of remaining bytes (1--16)
+
+	/* 1--16 bytes left in the buffer but string has not ended yet */
+2:	pcmpeqb		%xmm1, %xmm0		# NUL byte encountered?
+	pmovmskb	%xmm0, %r8d
+	mov		%r8d, %eax		# keep the NUL mask for the length search below
+	bts		%edx, %r8d		# treat end of buffer as end of string
+	tzcnt		%r8d, %r8d		# find tail length
+	add		%rsi, %rdi		# restore RDI
+	movdqu		(%rsi, %r8, 1), %xmm0	# load string tail
+	movdqu		%xmm0, (%rdi, %r8, 1)	# store string tail
+	movb		$0, 16(%rdi, %r8, 1)	# NUL terminate
+
+	/* continue to find the end of the string */
+	test		%eax, %eax		# end of string already reached?
+	jnz		1f			# yes: compute the length from this chunk
+
+	ALIGN_TEXT
+0:	pcmpeqb		32(%rsi), %xmm1		# NUL in the chunk at RSI + 32?
+	pmovmskb	%xmm1, %eax
+	pxor		%xmm1, %xmm1		# reset XMM1 to zero for the next comparison
+	test		%eax, %eax
+	jnz		2f
+
+	pcmpeqb		48(%rsi), %xmm1		# NUL in the chunk at RSI + 48?
+	pmovmskb	%xmm1, %eax
+	add		$32, %rsi
+	pxor		%xmm1, %xmm1
+	test		%eax, %eax
+	jz		0b
+
+1:	sub		$16, %rsi		# undo second advancement
+2:	tzcnt		%eax, %eax		# where is the NUL byte?
+	sub		%r9, %rsi		# distance of RSI from the start of src
+	lea		32(%rsi, %rax, 1), %rax	# return string length
+	ret
+
+4:	sub		$16, %rsi		# undo second advancement
+	add		$16, %rdx		# restore number of remaining bytes
+
+	/* string has ended but buffer has not */
+3:	tzcnt		%eax, %eax		# find length of string tail
+	movdqu		-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL)
+	add		%rsi, %rdi		# restore destination pointer
+	movdqu		%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL)
+	sub		%r9, %rsi		# string length to current chunk
+	add		%rsi, %rax		# plus length of current chunk
+	ret
+
+.Lhead_buf_end:
+	pmovmskb	%xmm1, %r8d		# NUL mask of the second chunk
+	add		$32, %edx		# restore edx to (len-1) + ecx
+	mov		%r8d, %eax		# keep the NUL mask for the length search below
+	shl		$16, %r8d		# place 2nd chunk NUL mask into bits 16--31
+	bts		%rdx, %r8		# treat end of buffer as end of string
+	tzcnt		%r8, %rdx		# find string/buffer len from alignment boundary
+	sub		%ecx, %edx		# find actual string/buffer len
+	movb		$0, (%rdi, %rdx, 1)	# write NUL terminator
+
+	/* continue to find the end of the string */
+	test		%eax, %eax		# end of string already reached?
+	jnz		1f			# yes: NUL is in the second chunk
+
+	ALIGN_TEXT
+0:	pcmpeqb		32(%rsi), %xmm1		# NUL in the chunk at RSI + 32?
+	pmovmskb	%xmm1, %eax
+	pxor		%xmm1, %xmm1		# reset XMM1 to zero for the next comparison
+	test		%eax, %eax
+	jnz		2f
+
+	pcmpeqb		48(%rsi), %xmm1		# NUL in the chunk at RSI + 48?
+	pmovmskb	%xmm1, %eax
+	add		$32, %rsi
+	pxor		%xmm1, %xmm1
+	test		%eax, %eax
+	jz		0b
+
+1:	sub		$16, %rsi		# compensate for the 32 added by the lea below
+2:	tzcnt		%eax, %eax		# where is the NUL byte?
+	sub		%r9, %rsi		# distance of RSI from the start of src
+	lea		32(%rsi, %rax, 1), %rax	# return string length
+	jmp		.L0031			# copy the RDX bytes that fit into the buffer
+
+.Lsecond_nul:
+	add		%r8, %rdx		# restore buffer length
+	tzcnt		%eax, %eax		# where is the NUL byte?
+	lea		-16(%rcx), %r8d		# misalignment minus the 16 bytes of the first chunk
+	sub		%r8d, %eax		# string length
+	cmp		%rax, %rdx		# is the string shorter than the buffer?
+	cmova		%rax, %rdx		# copy only min(buflen, srclen) bytes
+	movb		$0, (%rdi, %rdx, 1)	# write NUL terminator
+.L0031:	cmp		$16, %rdx		# at least 16 bytes to copy (not incl NUL)?
+	jb		.L0015
+
+	/* copy 16--31 bytes */
+	movdqu		(%r9), %xmm0		# load first 16 bytes
+	movdqu		-16(%r9, %rdx, 1), %xmm1 # load last 16 bytes
+	movdqu		%xmm0, (%rdi)		# store first 16 bytes
+	movdqu		%xmm1, -16(%rdi, %rdx, 1) # store last 16 bytes (may overlap the first store)
+	ret
+
+.Lhead_nul:
+	tzcnt		%eax, %eax		# where is the NUL byte?
+	sub		%ecx, %eax		# ... from the beginning of the string?
+	cmp		%rax, %rdx		# is the string shorter than the buffer?
+	cmova		%rax, %rdx		# copy only min(buflen, srclen) bytes
+	movb		$0, (%rdi, %rdx, 1)	# write NUL terminator
+
+	/* process strings of 0--15 bytes (rdx: min(buflen, srclen), rax: srclen) */
+.L0015:	cmp		$8, %rdx		# at least 8 bytes to copy?
+	jae		.L0815
+
+	cmp		$4, %rdx		# at least 4 bytes to copy?
+	jae		.L0407
+
+	cmp		$2, %rdx		# at least 2 bytes to copy?
+	jae		.L0203
+
+	movzbl		(%r9), %ecx		# load first byte from src
+	mov		%cl, (%rdi)		# deposit into destination
+	movb		$0, (%rdi, %rdx, 1)	# add NUL terminator (again)
+	ret
+
+.L0203:	movzwl		(%r9), %ecx		# first 2 bytes
+	movzwl		-2(%r9, %rdx, 1), %esi	# last 2 bytes (may overlap the first)
+	mov		%cx, (%rdi)
+	mov		%si, -2(%rdi, %rdx, 1)
+	ret
+
+.L0407:	mov		(%r9), %ecx		# first 4 bytes
+	mov		-4(%r9, %rdx, 1), %esi	# last 4 bytes (may overlap the first)
+	mov		%ecx, (%rdi)
+	mov		%esi, -4(%rdi, %rdx, 1)
+	ret
+
+.L0815:	mov		(%r9), %rcx		# first 8 bytes
+	mov		-8(%r9, %rdx, 1), %rsi	# last 8 bytes (may overlap the first)
+	mov		%rcx, (%rdi)
+	mov		%rsi, -8(%rdi, %rdx, 1)
+	ret
+
+	/* length zero destination: just return the string length */
+.L0:	mov		%rsi, %rdi		# src becomes the argument of strlen()
+	jmp		CNAME(strlen)		# tail call: strlen() returns to our caller
+ARCHEND(__strlcpy, baseline)
+
+	.section .note.GNU-stack,"",%progbits