git: af366d353b84 - main - amd64: implement strlen in assembly

Mateusz Guzik mjg at FreeBSD.org
Mon Feb 8 19:15:34 UTC 2021


The branch main has been updated by mjg:

URL: https://cgit.FreeBSD.org/src/commit/?id=af366d353b84bdc4e730f0fc563853abc338271c

commit af366d353b84bdc4e730f0fc563853abc338271c
Author:     Mateusz Guzik <mjg at FreeBSD.org>
AuthorDate: 2021-02-08 17:01:48 +0000
Commit:     Mateusz Guzik <mjg at FreeBSD.org>
CommitDate: 2021-02-08 19:15:21 +0000

    amd64: implement strlen in assembly
    
    The C variant in libkern performs excessive branching to find the
    non-zero byte instead of using the bsfq instruction. The same code
    patched to use it is still slower than the routine implemented here
    as the compiler keeps neglecting to perform certain optimizations
    (like using leaq).
    
    On top of that the routine can serve as a starting point for copyinstr
    which operates on words instead of bytes.
    
    Tested with glibc test suite.
    
    Sample results (calls/s):
    
    Haswell:
    $(perl -e "print 'A' x 3"):
    stock:  211198039
    patched:338626619
    asm:    465609618
    
    $(perl -e "print 'A' x 100"):
    stock:   83151997
    patched: 98285919
    asm:    120719888
    
    AMD EPYC 7R32:
    $(perl -e "print 'A' x 3"):
    stock:  282523617
    asm:    491498172
    
    $(perl -e "print 'A' x 100"):
    stock:  114857172
    asm:    112082057
---
 sys/amd64/amd64/support.S | 66 +++++++++++++++++++++++++++++++++++++++++++++++
 sys/conf/files            |  1 -
 sys/conf/files.arm        |  1 +
 sys/conf/files.arm64      |  1 +
 sys/conf/files.i386       |  1 +
 sys/conf/files.mips       |  1 +
 sys/conf/files.powerpc    |  1 +
 sys/conf/files.riscv      |  1 +
 8 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index b623fba277db..994c5f15e245 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -697,6 +697,72 @@ ENTRY(fillw)
 	ret
 END(fillw)
 
+/*
+ * strlen(string)
+ *	  %rdi
+ *
+ * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick.
+ *
+ * 0x01....01 is replaced with 0 - 0x01....01 so that it can be added
+ * with leaq.
+ *
+ * For a description see either:
+ * - "Hacker's Delight" by Henry S. Warren, Jr.
+ * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms"
+ *   by Agner Fog
+ *
+ * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
+ */
+ENTRY(strlen)
+	PUSH_FRAME_POINTER
+	movabsq	$0xfefefefefefefeff,%r8		/* 0 - 0x01....01 */
+	movabsq	$0x8080808080808080,%r9		/* high bit of every byte */
+
+	movq	%rdi,%r10			/* keep the original pointer for the final subtraction */
+	movq	%rdi,%rcx
+	testb	$7,%dil
+	jz	2f
+
+	/*
+	 * Handle misaligned reads: align to 8 and fill
+	 * the spurious bytes.
+	 */
+	andq	$~7,%rdi
+	movq	(%rdi),%r11
+	shlq	$3,%rcx				/* low bits of %cl = 8 * (pointer & 7) */
+	movq	$-1,%rdx
+	shlq	%cl,%rdx
+	notq	%rdx				/* 0xff in each byte that precedes the string */
+	orq	%rdx,%r11			/* spurious bytes can no longer look like NUL */
+
+	leaq	(%r11,%r8),%rcx
+	notq	%r11
+	andq	%r11,%rcx
+	andq	%r9,%rcx
+	jnz	3f
+
+	/*
+	 * Main loop.
+	 */
+	ALIGN_TEXT
+1:
+	leaq	8(%rdi),%rdi
+2:
+	movq	(%rdi),%r11
+	leaq	(%r11,%r8),%rcx			/* x - 0x01....01 */
+	notq	%r11				/* ~x */
+	andq	%r11,%rcx			/* (x - 0x01....01) & ~x, kept in %rcx for the test below */
+	andq	%r9,%rcx			/* ... & 0x80....80: non-zero iff the word holds a NUL */
+	jz	1b
+3:
+	bsfq	%rcx,%rcx			/* bit index of the lowest flagged byte */
+	shrq	$3,%rcx				/* bit index -> byte index within the word */
+	leaq	(%rcx,%rdi),%rax		/* address of the terminating NUL */
+	subq	%r10,%rax			/* length = NUL address - original pointer */
+	POP_FRAME_POINTER
+	ret
+END(strlen)
+
 /*****************************************************************************/
 /* copyout and fubyte family                                                 */
 /*****************************************************************************/
diff --git a/sys/conf/files b/sys/conf/files
index edca1003e904..1abfadb1e8d8 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -4085,7 +4085,6 @@ libkern/strdup.c		standard
 libkern/strndup.c		standard
 libkern/strlcat.c		standard
 libkern/strlcpy.c		standard
-libkern/strlen.c		standard
 libkern/strncat.c		standard
 libkern/strncmp.c		standard
 libkern/strncpy.c		standard
diff --git a/sys/conf/files.arm b/sys/conf/files.arm
index eb3a23b5fc21..69986585bdf6 100644
--- a/sys/conf/files.arm
+++ b/sys/conf/files.arm
@@ -127,6 +127,7 @@ libkern/lshrdi3.c		standard
 libkern/memcmp.c		standard
 libkern/moddi3.c		standard
 libkern/qdivrem.c		standard
+libkern/strlen.c		standard
 libkern/ucmpdi2.c		standard
 libkern/udivdi3.c		standard
 libkern/umoddi3.c		standard
diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64
index f7003b1048c8..42ec3b2787b1 100644
--- a/sys/conf/files.arm64
+++ b/sys/conf/files.arm64
@@ -432,6 +432,7 @@ libkern/memcmp.c		standard				\
 	compile-with "${NORMAL_C:N-fsanitize*}"
 libkern/memset.c		standard				\
 	compile-with "${NORMAL_C:N-fsanitize*}"
+libkern/strlen.c		standard
 libkern/arm64/crc32c_armv8.S	standard
 cddl/dev/dtrace/aarch64/dtrace_asm.S			optional dtrace compile-with "${DTRACE_S}"
 cddl/dev/dtrace/aarch64/dtrace_subr.c			optional dtrace compile-with "${DTRACE_C}"
diff --git a/sys/conf/files.i386 b/sys/conf/files.i386
index 6560ab217d96..1e2ab5f8c52a 100644
--- a/sys/conf/files.i386
+++ b/sys/conf/files.i386
@@ -218,6 +218,7 @@ libkern/memcmp.c		standard
 libkern/memset.c		standard
 libkern/moddi3.c		standard
 libkern/qdivrem.c		standard
+libkern/strlen.c		standard
 libkern/ucmpdi2.c		standard
 libkern/udivdi3.c		standard
 libkern/umoddi3.c		standard
diff --git a/sys/conf/files.mips b/sys/conf/files.mips
index c18f0a5c69be..7ee5b0019bd7 100644
--- a/sys/conf/files.mips
+++ b/sys/conf/files.mips
@@ -66,6 +66,7 @@ libkern/ucmpdi2.c			optional	mips | mipshf | mipsel | mipselhf
 libkern/ashldi3.c			standard
 libkern/ashrdi3.c			standard
 libkern/memcmp.c			standard
+libkern/strlen.c			standard
 
 # cfe support
 dev/cfe/cfe_api.c			optional	cfe
diff --git a/sys/conf/files.powerpc b/sys/conf/files.powerpc
index 3022fd6f6e39..347abee153d2 100644
--- a/sys/conf/files.powerpc
+++ b/sys/conf/files.powerpc
@@ -129,6 +129,7 @@ libkern/memcmp.c		standard
 libkern/memset.c		standard
 libkern/moddi3.c		optional	powerpc | powerpcspe
 libkern/qdivrem.c		optional	powerpc | powerpcspe
+libkern/strlen.c		standard
 libkern/ucmpdi2.c		optional	powerpc | powerpcspe
 libkern/udivdi3.c		optional	powerpc | powerpcspe
 libkern/umoddi3.c		optional	powerpc | powerpcspe
diff --git a/sys/conf/files.riscv b/sys/conf/files.riscv
index 3969528db07e..7ecea016b9a3 100644
--- a/sys/conf/files.riscv
+++ b/sys/conf/files.riscv
@@ -29,6 +29,7 @@ libkern/flsl.c			standard
 libkern/flsll.c			standard
 libkern/memcmp.c		standard
 libkern/memset.c		standard
+libkern/strlen.c		standard
 riscv/riscv/autoconf.c		standard
 riscv/riscv/bus_machdep.c	standard
 riscv/riscv/bus_space_asm.S	standard


More information about the dev-commits-src-main mailing list