git: 90253d49db09 - main - lib/libc/amd64/string: add stpncpy scalar, baseline implementation
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Mon, 25 Dec 2023 14:25:48 UTC
The branch main has been updated by fuz: URL: https://cgit.FreeBSD.org/src/commit/?id=90253d49db09a9b1490c448d05314f3e4bbfa468 commit 90253d49db09a9b1490c448d05314f3e4bbfa468 Author: Robert Clausecker <fuz@FreeBSD.org> AuthorDate: 2023-10-30 03:15:46 +0000 Commit: Robert Clausecker <fuz@FreeBSD.org> CommitDate: 2023-12-25 13:55:42 +0000 lib/libc/amd64/string: add stpncpy scalar, baseline implementation This was surprisingly annoying to get right, despite being such a simple function. A scalar implementation is also provided, it just calls into our optimised memchr(), memcpy(), and memset() routines to carry out its job. I'm quite happy with the performance. glibc only beats us for very long strings, likely due to the use of AVX-512. The scalar implementation just calls into our optimised memchr(), memcpy(), and memset() routines, so it has a high overhead to begin with but then performs ok for the amount of effort that went into it. Still beats the old C code, except for very short strings. Sponsored by: The FreeBSD Foundation Tested by: developers@, exp-run Approved by: mjg MFC after: 1 month MFC to: stable/14 PR: 275785 Differential Revision: https://reviews.freebsd.org/D42519 --- lib/libc/amd64/string/Makefile.inc | 1 + lib/libc/amd64/string/stpncpy.S | 283 +++++++++++++++++++++++++++++++++++++ 2 files changed, 284 insertions(+) diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index ee396f98eccc..cc8b0e825e3e 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -7,6 +7,7 @@ MDSRCS+= \ memmove.S \ memset.S \ stpcpy.S \ + stpncpy.S \ strcat.S \ strchrnul.S \ strcmp.S \ diff --git a/lib/libc/amd64/string/stpncpy.S b/lib/libc/amd64/string/stpncpy.S new file mode 100644 index 000000000000..5ce0dd093a9e --- /dev/null +++ b/lib/libc/amd64/string/stpncpy.S @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker <fuz@FreeBSD.org> + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak stpncpy + .set stpncpy, __stpncpy +ARCHFUNCS(__stpncpy) + ARCHFUNC(__stpncpy, scalar) + ARCHFUNC(__stpncpy, baseline) +ENDARCHFUNCS(__stpncpy) + +ARCHENTRY(__stpncpy, scalar) + push %rbp # establish stack frame + mov %rsp, %rbp + + push %rdx + push %rdi + push %rsi + push %rax # dummy push for alignment + + mov %rsi, %rdi + xor %esi, %esi + call CNAME(__memchr) # memchr(src, '\0', len) + pop %rcx # dummy pop + pop %rsi + mov -16(%rbp), %rdi + + test %rax, %rax # NUL found? + jz .Lfullcopy + + mov %rax, %rdx + sub %rsi, %rdx # copy until the NUL byte + add %rdx, -16(%rbp) # advance destination by string length + sub %rdx, -8(%rbp) # and shorten buffer size by string length + call CNAME(memcpy) + + pop %rdi + pop %rdx + xor %esi, %esi + pop %rbp + jmp CNAME(memset) # clear remaining buffer + +.Lfullcopy: + mov -8(%rbp), %rdx + call CNAME(memcpy) # copy whole string + add -8(%rbp), %rax # point to dest[n] + leave + ret +ARCHEND(__stpncpy, scalar) + + /* + * this mask allows us to generate masks of 16-n 0xff bytes + * followed by n 0x00 bytes by loading from .Lmask+n. + */ + .section .rodata +.Lmask: .quad 0xffffffffffffffff + .quad 0xffffffffffffffff + .quad 0x0000000000000000 + .quad 0x0000000000000000 + +/* stpncpy(char *restrict rdi, const char *rsi, size_t rdx) */ +ARCHENTRY(__stpncpy, baseline) +#define bounce (-3*16-8) /* location of on-stack bounce buffer */ + + test %rdx, %rdx # no bytes to copy? + jz .L0 + + mov %esi, %ecx + and $~0xf, %rsi # align source to 16 bytes + movdqa (%rsi), %xmm0 # load head + and $0xf, %ecx # offset from alignment + mov $-1, %r9d + lea -32(%rcx), %rax # set up overflow-proof comparison rdx+rcx<=32 + shl %cl, %r9d # mask of bytes belonging to the string + sub %rcx, %rdi # adjust RDI to correspond to RSI + pxor %xmm1, %xmm1 + movdqa %xmm0, bounce(%rsp) # stash copy of head on the stack + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %r8d + + lea (%rdx, %rcx, 1), %r10 # buffer length from alignment boundary + add %rdx, %rax # less than 2 chunks (32 bytes) to play with? + jnc .Lrunt # if yes, use special runt processing + + movdqu %xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination + and %r9d, %r8d # end of string within head? + jnz .Lheadnul + + movdqu (%rsi, %rcx, 1), %xmm2 # load head from source buffer + movdqu %xmm2, (%rdi, %rcx, 1) # an deposit + + add $16, %rsi + add $16, %rdi + sub $32, %r10 + + /* main loop unrolled twice */ + ALIGN_TEXT +0: movdqa (%rsi), %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %r8d + test %r8d, %r8d + jnz 3f + + movdqu %xmm0, (%rdi) + cmp $16, %r10 # more than a full chunk left? + jbe 1f + + movdqa 16(%rsi), %xmm0 + add $32, %rdi # advance pointers to next chunk + add $32, %rsi + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %r8d + test %r8d, %r8d + jnz 2f + + movdqu %xmm0, -16(%rdi) + sub $32, %r10 # more than another full chunk left? + ja 0b + + sub $16, %rdi # undo second advancement + sub $16, %rsi + add $16, %r10d # restore number of remaining bytes + + /* 1--16 bytes left but string has not ended yet */ +1: pxor %xmm1, %xmm1 + pcmpeqb 16(%rsi), %xmm1 # NUL byte in source tail? + pmovmskb %xmm1, %r8d + bts %r10d, %r8d # treat end of buffer as NUL + tzcnt %r8d, %r8d # where is the NUL byte? + movdqu (%rsi, %r8, 1), %xmm0 # load source tail before NUL + lea 16(%rdi, %r8, 1), %rax # point return value to NUL byte + # or end of buffer + movdqu %xmm0, (%rdi, %r8, 1) # store tail into the buffer + ret + +2: sub $16, %rdi # undo second advancement + sub $16, %rsi + sub $16, %r10 + + /* string has ended and buffer has not */ +3: tzcnt %r8d, %r8d # where did the string end? + lea .Lmask+16(%rip), %rcx + lea (%rdi, %r8, 1), %rax # where the NUL byte will be + neg %r8 + movdqu (%rcx, %r8, 1), %xmm1 # mask with FF where the string is, + # 00 where it is not + pand %xmm1, %xmm0 # mask out bytes after the string + movdqu %xmm0, (%rdi) # store masked current chunk + pxor %xmm1, %xmm1 + sub $16, %r10 # another full chunk left? + jbe 1f + + /* clear remaining destination buffer (tail has been cleared earlier) */ + ALIGN_TEXT +0: movdqu %xmm1, 16(%rdi) + cmp $16, %r10 + jbe 1f + + movdqu %xmm1, 32(%rdi) + add $32, %rdi + sub $32, %r10 + ja 0b + +1: ret + + /* at least two chunks to play with and NUL while processing head */ +.Lheadnul: + movdqu bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack + tzcnt %r8d, %r8d # find location of NUL byte + movdqu %xmm0, (%rdi, %rcx, 1) # deposit head in the destination + movdqu %xmm1, (%rdi, %r8, 1) # clear out following bytes + movdqu %xmm1, 16(%rdi) # clear out second chunk + lea (%rdi, %r8, 1), %rax # make RAX point to the NUL byte + + add $32, %rdi # advance past first two chunks + sub $32+16, %r10 # advance past first three chunks + jbe 1f # did we pass the end of the buffer? + + /* clear remaining destination buffer (tail has been cleared earlier) */ + ALIGN_TEXT +0: movdqu %xmm1, (%rdi) # clear out buffer chunk + cmp $16, %r10 + jbe 1f + + movdqu %xmm1, 16(%rdi) + add $32, %rdi + sub $32, %r10 + ja 0b + +1: ret + + /* 1--32 bytes to copy, bounce through the stack */ +.Lrunt: movdqa %xmm1, bounce+16(%rsp) # clear out rest of on-stack copy + bts %r10d, %r8d # treat end of buffer as end of string + and %r9w, %r8w # end of string within first buffer? + jnz 0f # if yes, do not inspect second buffer + + movdqa 16(%rsi), %xmm0 # load second chunk of input + movdqa %xmm0, bounce+16(%rsp) # stash copy on stack + pcmpeqb %xmm1, %xmm0 # NUL in second chunk? + pmovmskb %xmm0, %r9d + shl $16, %r9d + or %r9d, %r8d # merge found NUL bytes into NUL mask + + /* end of string after one buffer */ +0: tzcnt %r8d, %r8d # location of last char in string + movdqu %xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string + lea bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack + lea (%rdi, %r8, 1), %rax # return pointer to NUL byte + + cmp $16, %edx # at least 16 bytes to transfer? + jae .L1631 + + mov (%rsi), %r8 # load string head + cmp $8, %edx # at least 8 bytes to transfer? + jae .L0815 + + cmp $4, %edx # at least 4 bytes to transfer? + jae .L0407 + + movzwl -2(%rsi, %rdx, 1), %esi # load last two bytes of string + mov %r8b, (%rdi, %rcx, 1) # store first byte + + cmp $2, %edx # at least 2 bytes to transfer? + jb .L1 + + mov %si, -2(%rdi, %r10, 1) # store last two bytes of string +.L1: ret + +.L1631: movdqu (%rsi), %xmm0 # load first 16 bytes of string + movdqu -16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string + movdqu %xmm0, (%rdi, %rcx, 1) + movdqu %xmm1, -16(%rdi, %r10, 1) + ret + +.L0815: mov -8(%rsi, %rdx, 1), %rdx # load last 8 bytes of string + mov %r8, (%rdi, %rcx, 1) + mov %rdx, -8(%rdi, %r10, 1) + ret + +.L0407: mov -4(%rsi, %rdx, 1), %edx # load last four bytes of string + mov %r8d, (%rdi, %rcx, 1) + mov %edx, -4(%rdi, %r10, 1) + ret + + /* length 0 buffer: just return dest */ +.L0: mov %rdi, %rax + ret +ARCHEND(__stpncpy, baseline) + + .section .note.GNU-stack,"",%progbits