git: 61f4c4d3dd38 - main - lib/libc/amd64/string: add strchrnul implementations (scalar, baseline)
- Reply: Konstantin Belousov : "Re: git: 61f4c4d3dd38 - main - lib/libc/amd64/string: add strchrnul implementations (scalar, baseline)"
- Reply: Konstantin Belousov : "Re: git: 61f4c4d3dd38 - main - lib/libc/amd64/string: add strchrnul implementations (scalar, baseline)"
- Reply: Jessica Clarke : "Re: git: 61f4c4d3dd38 - main - lib/libc/amd64/string: add strchrnul implementations (scalar, baseline)"
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Sun, 06 Aug 2023 14:10:06 UTC
The branch main has been updated by fuz:
URL: https://cgit.FreeBSD.org/src/commit/?id=61f4c4d3dd38c5b79e04fff1dedb02071ebb33f2
commit 61f4c4d3dd38c5b79e04fff1dedb02071ebb33f2
Author: Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2023-06-30 14:45:11 +0000
Commit: Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2023-08-06 13:58:27 +0000
lib/libc/amd64/string: add strchrnul implementations (scalar, baseline)
A lot better than the generic (pre) implementaion. We do not beat glibc
for long strings, likely due to glibc switching to AVX once the input is
sufficiently long. X86-64-v3 and v4 implementations may be added at a
future time.
os: FreeBSD
arch: amd64
cpu: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
│ strchrnul_pre.out │ strchrnul_scalar.out │ strchrnul_baseline.out │
│ sec/op │ sec/op vs base │ sec/op vs base │
Short 129.68µ ± 3% 59.91µ ± 1% -53.80% (p=0.000 n=20) 44.37µ ± 1% -65.79% (p=0.000 n=20)
Mid 21.15µ ± 0% 19.30µ ± 0% -8.76% (p=0.000 n=20) 12.30µ ± 0% -41.85% (p=0.000 n=20)
Long 13.772µ ± 0% 11.028µ ± 0% -19.92% (p=0.000 n=20) 3.285µ ± 0% -76.15% (p=0.000 n=20)
geomean 33.55µ 23.36µ -30.37% 12.15µ -63.80%
│ strchrnul_pre.out │ strchrnul_scalar.out │ strchrnul_baseline.out │
│ B/s │ B/s vs base │ B/s vs base │
Short 919.3Mi ± 3% 1989.7Mi ± 1% +116.45% (p=0.000 n=20) 2686.8Mi ± 1% +192.28% (p=0.000 n=20)
Mid 5.505Gi ± 0% 6.033Gi ± 0% +9.60% (p=0.000 n=20) 9.466Gi ± 0% +71.97% (p=0.000 n=20)
Long 8.453Gi ± 0% 10.557Gi ± 0% +24.88% (p=0.000 n=20) 35.441Gi ± 0% +319.26% (p=0.000 n=20)
geomean 3.470Gi 4.983Gi +43.62% 9.584Gi +176.22%
For comparison, glibc on the same machine:
│ strchrnul_glibc.out │
│ sec/op │
Short 49.73µ ± 0%
Mid 14.60µ ± 0%
Long 1.237µ ± 0%
geomean 9.646µ
│ strchrnul_glibc.out │
│ B/s │
Short 2.341Gi ± 0%
Mid 7.976Gi ± 0%
Long 94.14Gi ± 0%
geomean 12.07Gi
Sponsored by: The FreeBSD Foundation
Approved by: mjg
Differential Revision: https://reviews.freebsd.org/D41333
---
lib/libc/amd64/string/Makefile.inc | 1 +
lib/libc/amd64/string/strchrnul.S | 167 +++++++++++++++++++++++++++++++++++++
2 files changed, 168 insertions(+)
diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index 1bfefa7be98c..b5260cfd78d1 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -8,6 +8,7 @@ MDSRCS+= \
memmove.S \
memset.S \
strcat.S \
+ strchrnul.S \
strcmp.S \
strlen.S \
stpcpy.S
diff --git a/lib/libc/amd64/string/strchrnul.S b/lib/libc/amd64/string/strchrnul.S
new file mode 100644
index 000000000000..6d22f31850f8
--- /dev/null
+++ b/lib/libc/amd64/string/strchrnul.S
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4,0x90 # 16-byte alignment, nop-filled
+
+ .weak strchrnul
+ .set strchrnul, __strchrnul
+
+ARCHFUNCS(__strchrnul)
+ ARCHFUNC(__strchrnul, scalar)
+ ARCHFUNC(__strchrnul, baseline)
+ENDARCHFUNCS(__strchrnul)
+
+/*
+ * strchrnul(str, c)
+ * This is implemented like strlen(str), but we check for the
+ * presence of both NUL and c in each iteration.
+ */
+ARCHENTRY(__strchrnul, scalar)
+ mov %edi, %ecx
+ and $~7, %rdi # align to 8 byte
+ movzbl %sil, %esi # clear stray high bits
+ movabs $0x0101010101010101, %r8
+ mov (%rdi), %rax # load first word
+ imul %r8, %rsi # replicate char 8 times
+ movabs $0x8080808080808080, %r9
+
+ /*
+ * Unaligned input: align to 8 bytes. Then proceed the same
+ * way as with aligned input, but ignore matches before the
+ * beginning of the string. This is achieved by shifting r9
+ * into r10 to have 0x00 bytes before the string begins.
+ */
+ shl $3, %ecx
+ mov %r9, %r10
+ add $8, %rdi
+ shl %cl, %r10 # 0x80 where the string is
+ neg %r8 # negate 01..01 so we can use lea
+
+ mov %rsi, %rcx
+ xor %rax, %rcx # str ^ c
+ lea (%rax, %r8, 1), %rdx # str - 0x01..01
+ lea (%rcx, %r8, 1), %r11 # (str ^ c) - 0x01..01
+ not %rax # ~str
+ not %rcx # ~(str ^ c)
+ and %rdx, %rax # (str - 0x01..01) & ~str
+ and %r11, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c)
+ or %rcx, %rax # matches for both
+ and %r10, %rax # not including junk bytes or bytes before the string
+ jnz 1f
+
+ /* main loop unrolled twice */
+ ALIGN_TEXT
+0: mov (%rdi), %rax # str
+ mov %rsi, %rcx
+ xor %rax, %rcx # str ^ c
+ lea (%rax, %r8, 1), %rdx # str - 0x01..01
+ lea (%rcx, %r8, 1), %r11 # (str ^ c) - 0x01..01
+ not %rax # ~str
+ not %rcx # ~(str ^ c)
+ and %rdx, %rax # (str - 0x01..01) & ~str
+ and %r11, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c)
+ or %rcx, %rax # matches for both
+ and %r9, %rax # not including junk bits
+ jnz 2f
+
+ mov 8(%rdi), %rax # str
+ add $16, %rdi
+ mov %rsi, %rcx
+ xor %rax, %rcx # str ^ c
+ lea (%rax, %r8, 1), %rdx # str - 0x01..01
+ lea (%rcx, %r8, 1), %r11 # (str ^ c) - 0x01..01
+ not %rax # ~str
+ not %rcx # ~(str ^ c)
+ and %rdx, %rax # (str - 0x01..01) & ~str
+ and %r11, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c)
+ or %rcx, %rax # matches for both
+ and %r9, %rax # not including junk bits
+ jz 0b
+
+ /* NUL or c found */
+1: sub $8, %rdi # undo advance past buffer
+2: tzcnt %rax, %rax # first NUL or c byte match
+ shr $3, %eax # scale from bit to byte index
+ add %rdi, %rax # pointer to found c or NUL
+ ret
+ARCHEND(__strchrnul, scalar)
+
+ARCHENTRY(__strchrnul, baseline)
+ mov %edi, %ecx
+ and $~0xf, %rdi # align to 16 byte
+ movdqa (%rdi), %xmm1
+ movd %esi, %xmm0
+ and $0xf, %ecx # distance from (%rdi) to start of string
+ pxor %xmm2, %xmm2
+ mov $-1, %edx
+ punpcklbw %xmm0, %xmm0 # c -> cc
+ shl %cl, %edx # bits corresponding to bytes in the string
+ punpcklwd %xmm0, %xmm0 # cc -> cccc
+ add $16, %rdi
+
+ /* check for match in head */
+ pcmpeqb %xmm1, %xmm2 # NUL bytes present?
+ pshufd $0, %xmm0, %xmm0 # cccc -> cccccccccccccccc
+ pcmpeqb %xmm0, %xmm1 # c present?
+ por %xmm2, %xmm1 # either present?
+ pmovmskb %xmm1, %eax
+ and %edx, %eax # match in the string?
+ jnz 1f
+
+ /* main loop unrolled twice */
+ ALIGN_TEXT
+0: movdqa (%rdi), %xmm1
+ pxor %xmm2, %xmm2
+ pcmpeqb %xmm1, %xmm2 # NUL bytes present?
+ pcmpeqb %xmm0, %xmm1 # c present?
+ por %xmm2, %xmm1 # either present?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax # match in the string?
+ jnz 2f
+
+ movdqa 16(%rdi), %xmm1
+ add $32, %rdi
+ pxor %xmm2, %xmm2
+ pcmpeqb %xmm1, %xmm2 # NUL bytes present?
+ pcmpeqb %xmm0, %xmm1 # c present?
+ por %xmm2, %xmm1 # either present?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax # match in the string?
+ jz 0b
+
+1: sub $16, %rdi # undo advance past buffer
+2: tzcnt %eax, %eax # where is the match?
+ add %rdi, %rax # pointer to found c or NUL
+ ret
+ARCHEND(__strchrnul, baseline)
+
+ .section .note.GNU-stack,"",%progbits