git: 90253d49db09 - main - lib/libc/amd64/string: add stpncpy scalar, baseline implementation

From: Robert Clausecker <fuz@FreeBSD.org>
Date: Mon, 25 Dec 2023 14:25:48 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=90253d49db09a9b1490c448d05314f3e4bbfa468

commit 90253d49db09a9b1490c448d05314f3e4bbfa468
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2023-10-30 03:15:46 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2023-12-25 13:55:42 +0000

    lib/libc/amd64/string: add stpncpy scalar, baseline implementation
    
    This was surprisingly annoying to get right, despite being such a simple
    function.  A scalar implementation is also provided; it just calls into
    our optimised memchr(), memcpy(), and memset() routines to carry out its
    job.
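    
    For reference, the scalar variant behaves like the following C sketch
    (illustrative only: the function name is made up, and the real routine
    manages its stack frame by hand, tail-calling memset() to produce the
    return value):
    
        #include <string.h>
        
        char *
        stpncpy_scalar_sketch(char *restrict dst, const char *restrict src,
            size_t len)
        {
                const char *nul = memchr(src, '\0', len);
        
                if (nul == NULL) {
                        memcpy(dst, src, len);  /* no NUL: copy whole buffer */
                        return (dst + len);
                }
        
                memcpy(dst, src, nul - src);
                /* memset() returns its first argument, which is exactly
                   where the terminating NUL lands */
                return (memset(dst + (nul - src), '\0', len - (nul - src)));
        }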
    
    I'm quite happy with the performance.  glibc only beats us for very long
    strings, likely due to its use of AVX-512.  As the scalar implementation
    works by calling into other routines, it has a high overhead to begin
    with, but then performs acceptably for the amount of effort that went
    into it.  It still beats the old C code, except for very short strings.
    
    Sponsored by:   The FreeBSD Foundation
    Tested by:      developers@, exp-run
    Approved by:    mjg
    MFC after:      1 month
    MFC to:         stable/14
    PR:             275785
    Differential Revision: https://reviews.freebsd.org/D42519
---
 lib/libc/amd64/string/Makefile.inc |   1 +
 lib/libc/amd64/string/stpncpy.S    | 283 +++++++++++++++++++++++++++++++++++++
 2 files changed, 284 insertions(+)

diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index ee396f98eccc..cc8b0e825e3e 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -7,6 +7,7 @@ MDSRCS+= \
 	memmove.S \
 	memset.S \
 	stpcpy.S \
+	stpncpy.S \
 	strcat.S \
 	strchrnul.S \
 	strcmp.S \
diff --git a/lib/libc/amd64/string/stpncpy.S b/lib/libc/amd64/string/stpncpy.S
new file mode 100644
index 000000000000..5ce0dd093a9e
--- /dev/null
+++ b/lib/libc/amd64/string/stpncpy.S
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT	.p2align 4, 0x90
+
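+/*
+ * The ARCHFUNCS block below registers both variants with libc's
+ * amd64 archlevel dispatch framework, which selects an implementation
+ * at load time; see amd64_archlevel.h and simd(7).
+ */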
+	.weak stpncpy
+	.set stpncpy, __stpncpy
+ARCHFUNCS(__stpncpy)
+	ARCHFUNC(__stpncpy, scalar)
+	ARCHFUNC(__stpncpy, baseline)
+ENDARCHFUNCS(__stpncpy)
+
+ARCHENTRY(__stpncpy, scalar)
+	push	%rbp		# establish stack frame
+	mov	%rsp, %rbp
+
+	push	%rdx
+	push	%rdi
+	push	%rsi
+	push	%rax		# dummy push for alignment
+
+	mov	%rsi, %rdi
+	xor	%esi, %esi
+	call	CNAME(__memchr)	# memchr(src, '\0', len)
+	pop	%rcx		# dummy pop
+	pop	%rsi
+	mov	-16(%rbp), %rdi
+
+	test	%rax, %rax	# NUL found?
+	jz	.Lfullcopy
+
+	mov	%rax, %rdx
+	sub	%rsi, %rdx	# copy until the NUL byte
+	add	%rdx, -16(%rbp)	# advance destination by string length
+	sub	%rdx, -8(%rbp)	# and shorten buffer size by string length
+	call	CNAME(memcpy)
+
+	pop	%rdi
+	pop	%rdx
+	xor	%esi, %esi
+	pop	%rbp
+	jmp	CNAME(memset)	# clear remaining buffer
+
+.Lfullcopy:
+	mov	-8(%rbp), %rdx
+	call	CNAME(memcpy)	# copy whole string
+	add	-8(%rbp), %rax	# point to dest[n]
+	leave
+	ret
+ARCHEND(__stpncpy, scalar)
+
+	/*
+	 * this mask allows us to generate masks of 16-n 0xff bytes
+	 * followed by n 0x00 bytes by loading from .Lmask+n.
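+	 * E.g., a 16-byte load from .Lmask+3 yields thirteen 0xff
+	 * bytes followed by three 0x00 bytes.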
+	 */
+	.section	.rodata
+.Lmask:	.quad		0xffffffffffffffff
+	.quad		0xffffffffffffffff
+	.quad		0x0000000000000000
+	.quad		0x0000000000000000
+
+/* stpncpy(char *restrict rdi, const char *rsi, size_t rdx) */
+ARCHENTRY(__stpncpy, baseline)
+#define bounce		(-3*16-8)		/* location of on-stack bounce buffer */
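+/*
+ * The bounce buffer sits in the 128-byte red zone below %rsp, which
+ * the amd64 ABI keeps safe from signal handlers and which functions
+ * that make no calls, like this one, may use freely.
+ */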
+
+	test		%rdx, %rdx		# no bytes to copy?
+	jz		.L0
+
+	mov		%esi, %ecx
+	and		$~0xf, %rsi		# align source to 16 bytes
+	movdqa		(%rsi), %xmm0		# load head
+	and		$0xf, %ecx		# offset from alignment
+	mov		$-1, %r9d
+	lea		-32(%rcx), %rax		# set up overflow-proof comparison rdx+rcx<=32
+	shl		%cl, %r9d		# mask of bytes belonging to the string
+	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
+	pxor		%xmm1, %xmm1
+	movdqa		%xmm0, bounce(%rsp)	# stash copy of head on the stack
+	pcmpeqb		%xmm1, %xmm0
+	pmovmskb	%xmm0, %r8d
+
+	lea		(%rdx, %rcx, 1), %r10	# buffer length from alignment boundary
+	add		%rdx, %rax		# less than 2 chunks (32 bytes) to play with?
+	jnc		.Lrunt			# if yes, use special runt processing
+
+	movdqu		%xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination
+	and		%r9d, %r8d		# end of string within head?
+	jnz		.Lheadnul
+
+	movdqu		(%rsi, %rcx, 1), %xmm2	# load head from source buffer
+	movdqu		%xmm2, (%rdi, %rcx, 1)	# and deposit it there
+
+	add		$16, %rsi
+	add		$16, %rdi
+	sub		$32, %r10
+
+	/* main loop unrolled twice */
+	ALIGN_TEXT
+0:	movdqa		(%rsi), %xmm0
+	pxor		%xmm1, %xmm1
+	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
+	pmovmskb	%xmm1, %r8d
+	test		%r8d, %r8d
+	jnz		3f
+
+	movdqu		%xmm0, (%rdi)
+	cmp		$16, %r10		# more than a full chunk left?
+	jbe		1f
+
+	movdqa		16(%rsi), %xmm0
+	add		$32, %rdi		# advance pointers to next chunk
+	add		$32, %rsi
+	pxor		%xmm1, %xmm1
+	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
+	pmovmskb	%xmm1, %r8d
+	test		%r8d, %r8d
+	jnz		2f
+
+	movdqu		%xmm0, -16(%rdi)
+	sub		$32, %r10		# more than another full chunk left?
+	ja		0b
+
+	sub		$16, %rdi		# undo second advancement
+	sub		$16, %rsi
+	add		$16, %r10d		# restore number of remaining bytes
+
+	/* 1--16 bytes left but string has not ended yet */
+1:	pxor		%xmm1, %xmm1
+	pcmpeqb		16(%rsi), %xmm1		# NUL byte in source tail?
+	pmovmskb	%xmm1, %r8d
+	bts		%r10d, %r8d		# treat end of buffer as NUL
+	tzcnt		%r8d, %r8d		# where is the NUL byte?
+	movdqu		(%rsi, %r8, 1), %xmm0	# load source tail before NUL
+	lea		16(%rdi, %r8, 1), %rax	# point return value to NUL byte
+						# or end of buffer
+	movdqu		%xmm0, (%rdi, %r8, 1)	# store tail into the buffer
+	ret
+
+2:	sub		$16, %rdi		# undo second advancement
+	sub		$16, %rsi
+	sub		$16, %r10
+
+	/* string has ended and buffer has not */
+3:	tzcnt		%r8d, %r8d		# where did the string end?
+	lea		.Lmask+16(%rip), %rcx
+	lea		(%rdi, %r8, 1), %rax 	# where the NUL byte will be
+	neg		%r8
+	movdqu		(%rcx, %r8, 1), %xmm1	# mask with FF where the string is,
+						# 00 where it is not
+	pand		%xmm1, %xmm0		# mask out bytes after the string
+	movdqu		%xmm0, (%rdi)	 	# store masked current chunk
+	pxor		%xmm1, %xmm1
+	sub		$16, %r10		# another full chunk left?
+	jbe		1f
+
+	/* clear remaining destination buffer (tail has been cleared earlier) */
+	ALIGN_TEXT
+0:	movdqu		%xmm1, 16(%rdi)
+	cmp		$16, %r10
+	jbe		1f
+
+	movdqu		%xmm1, 32(%rdi)
+	add		$32, %rdi
+	sub		$32, %r10
+	ja		0b
+
+1:	ret
+
+	/* at least two chunks to play with and NUL while processing head */
+.Lheadnul:
+	movdqu		bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack
+	tzcnt		%r8d, %r8d		# find location of NUL byte
+	movdqu		%xmm0, (%rdi, %rcx, 1)	# deposit head in the destination
+	movdqu		%xmm1, (%rdi, %r8, 1)	# clear out following bytes
+	movdqu		%xmm1, 16(%rdi)		# clear out second chunk
+	lea		(%rdi, %r8, 1), %rax	# make RAX point to the NUL byte
+
+	add		$32, %rdi		# advance past first two chunks
+	sub		$32+16, %r10		# advance past first three chunks
+	jbe		1f			# did we pass the end of the buffer?
+
+	/* clear remaining destination buffer (tail has been cleared earlier) */
+	ALIGN_TEXT
+0:	movdqu		%xmm1, (%rdi)		# clear out buffer chunk
+	cmp		$16, %r10
+	jbe		1f
+
+	movdqu		%xmm1, 16(%rdi)
+	add		$32, %rdi
+	sub		$32, %r10
+	ja		0b
+
+1:	ret
+
+	/* 1--32 bytes to copy, bounce through the stack */
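+	/*
+	 * The padded string is assembled in the bounce buffer and then
+	 * moved to the destination with pairs of possibly overlapping
+	 * loads and stores, dispatched on the length below.
+	 */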
+.Lrunt:	movdqa		%xmm1, bounce+16(%rsp)	# clear out rest of on-stack copy
+	bts		%r10d, %r8d		# treat end of buffer as end of string
+	and		%r9w, %r8w		# end of string within first buffer?
+	jnz		0f			# if yes, do not inspect second buffer
+
+	movdqa		16(%rsi), %xmm0		# load second chunk of input
+	movdqa		%xmm0, bounce+16(%rsp)	# stash copy on stack
+	pcmpeqb		%xmm1, %xmm0		# NUL in second chunk?
+	pmovmskb	%xmm0, %r9d
+	shl		$16, %r9d
+	or		%r9d, %r8d		# merge found NUL bytes into NUL mask
+
+	/* end of string after one buffer */
+0:	tzcnt		%r8d, %r8d		# location of last char in string
+	movdqu		%xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string
+	lea		bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack
+	lea		(%rdi, %r8, 1), %rax	# return pointer to NUL byte
+
+	cmp		$16, %edx		# at least 16 bytes to transfer?
+	jae		.L1631
+
+	mov		(%rsi), %r8		# load string head
+	cmp		$8, %edx		# at least 8 bytes to transfer?
+	jae		.L0815
+
+	cmp		$4, %edx		# at least 4 bytes to transfer?
+	jae		.L0407
+
+	movzwl		-2(%rsi, %rdx, 1), %esi	# load last two bytes of string
+	mov		%r8b, (%rdi, %rcx, 1)	# store first byte
+
+	cmp		$2, %edx		# at least 2 bytes to transfer?
+	jb		.L1
+
+	mov		%si, -2(%rdi, %r10, 1)	# store last two bytes of string
+.L1:	ret
+
+.L1631:	movdqu		(%rsi), %xmm0		# load first 16 bytes of string
+	movdqu		-16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string
+	movdqu		%xmm0, (%rdi, %rcx, 1)
+	movdqu		%xmm1, -16(%rdi, %r10, 1)
+	ret
+
+.L0815:	mov		-8(%rsi, %rdx, 1), %rdx	# load last 8 bytes of string
+	mov		%r8, (%rdi, %rcx, 1)
+	mov		%rdx, -8(%rdi, %r10, 1)
+	ret
+
+.L0407:	mov		-4(%rsi, %rdx, 1), %edx	# load last four bytes of string
+	mov		%r8d, (%rdi, %rcx, 1)
+	mov		%edx, -4(%rdi, %r10, 1)
+	ret
+
+	/* length 0 buffer: just return dest */
+.L0:	mov		%rdi, %rax
+	ret
+ARCHEND(__stpncpy, baseline)
+
+	.section .note.GNU-stack,"",%progbits
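
For readers unfamiliar with stpncpy(3), here is a quick illustration of
the contract both new implementations must satisfy (a hypothetical
standalone test, not part of the commit):

	#include <assert.h>
	#include <string.h>

	int
	main(void)
	{
		char buf[8];
		char *p;

		/* short source: the remainder of the buffer is NUL padded
		 * and the return value points at the first NUL written */
		p = stpncpy(buf, "hi", sizeof(buf));
		assert(p == buf + 2 && *p == '\0' && buf[7] == '\0');

		/* long source: exactly sizeof(buf) bytes are copied, no
		 * terminator is written, and the return value points one
		 * past the end of the buffer */
		p = stpncpy(buf, "truncated", sizeof(buf));
		assert(p == buf + 8 && memcmp(buf, "truncate", 8) == 0);

		return (0);
	}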