svn commit: r357208 - head/sys/amd64/amd64
Mateusz Guzik
mjg at FreeBSD.org
Tue Jan 28 17:48:18 UTC 2020
Author: mjg
Date: Tue Jan 28 17:48:17 2020
New Revision: 357208
URL: https://svnweb.freebsd.org/changeset/base/357208
Log:
amd64: revamp memcmp
Borrow the trick from memset and memmove and use the scale/index/base addressing
to avoid branches.
If a mismatch is found, the routine has to calculate the difference. Make sure
there are always at most 8 bytes left to inspect. This replaces the previous loop,
which would operate over up to 16 bytes, with an unrolled sequence of 8 byte tests.
Speed varies a lot, but this is a net win over the previous routine, and there is
probably a lot more to be gained.
Validated with glibc test suite.
Modified:
head/sys/amd64/amd64/support.S
Modified: head/sys/amd64/amd64/support.S
==============================================================================
--- head/sys/amd64/amd64/support.S Tue Jan 28 17:48:14 2020 (r357207)
+++ head/sys/amd64/amd64/support.S Tue Jan 28 17:48:17 2020 (r357208)
@@ -111,92 +111,191 @@ END(sse2_pagezero)
*/
ENTRY(memcmp)
PUSH_FRAME_POINTER
+
+ xorl %eax,%eax
+10:
cmpq $16,%rdx
- jae 5f
+ ja 101632f
+
+100816:
+ cmpb $8,%dl
+ jl 100408f
+ movq (%rdi),%r8
+ movq (%rsi),%r9
+ cmpq %r8,%r9
+ jne 1f
+ movq -8(%rdi,%rdx),%r8
+ movq -8(%rsi,%rdx),%r9
+ cmpq %r8,%r9
+ jne 10081608f
+ POP_FRAME_POINTER
+ ret
+100408:
+ cmpb $4,%dl
+ jl 100204f
+ movl (%rsi),%r8d
+ movl (%rdi),%r9d
+ cmpl %r8d,%r9d
+ jne 1f
+ movl -4(%rsi,%rdx),%r8d
+ movl -4(%rdi,%rdx),%r9d
+ cmpl %r8d,%r9d
+ jne 1f
+ POP_FRAME_POINTER
+ ret
+100204:
+ cmpb $2,%dl
+ jl 100001f
+ movzwl (%rsi),%r8d
+ movzwl (%rdi),%r9d
+ cmpl %r8d,%r9d
+ jne 1f
+ movzwl -2(%rsi,%rdx),%r8d
+ movzwl -2(%rdi,%rdx),%r9d
+ cmpl %r8d,%r9d
+ jne 1f
+ POP_FRAME_POINTER
+ ret
+100001:
+ cmpb $1,%dl
+ jl 100000f
+ movzbl (%rdi),%r8d
+ movzbl (%rsi),%r9d
+ cmpb %r8b,%r9b
+ jne 1f
+100000:
+ POP_FRAME_POINTER
+ ret
+ALIGN_TEXT
+101632:
+ cmpq $32,%rdx
+ ja 103200f
+ movq (%rdi),%r8
+ movq (%rsi),%r9
+ cmpq %r8,%r9
+ jne 1f
+ movq 8(%rdi),%r8
+ movq 8(%rsi),%r9
+ cmpq %r8,%r9
+ jne 10163208f
+ movq -16(%rdi,%rdx),%r8
+ movq -16(%rsi,%rdx),%r9
+ cmpq %r8,%r9
+ jne 10163216f
+ movq -8(%rdi,%rdx),%r8
+ movq -8(%rsi,%rdx),%r9
+ cmpq %r8,%r9
+ jne 10163224f
+ POP_FRAME_POINTER
+ ret
+ALIGN_TEXT
+103200:
+ movq (%rdi),%r8
+ movq 8(%rdi),%r9
+ subq (%rsi),%r8
+ subq 8(%rsi),%r9
+ or %r8,%r9
+ jnz 10320000f
+
+ movq 16(%rdi),%r8
+ movq 24(%rdi),%r9
+ subq 16(%rsi),%r8
+ subq 24(%rsi),%r9
+ or %r8,%r9
+ jnz 10320016f
+
+ leaq 32(%rdi),%rdi
+ leaq 32(%rsi),%rsi
+ subq $32,%rdx
+ cmpq $32,%rdx
+ jae 103200b
+ cmpb $0,%dl
+ jne 10b
+ POP_FRAME_POINTER
+ ret
+
+10320016:
+ leaq 16(%rdi),%rdi
+ leaq 16(%rsi),%rsi
+10320000:
+/*
+ * Mismatch was found within a 16 bytes range. The part of the routine
+ * which calculates it only operates on sizes up to 8 bytes. Find the
+ * right part.
+ */
+ movq (%rdi),%r8
+ movq (%rsi),%r9
+ cmpq %r8,%r9
+ jne 1f
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jmp 1f
+10163224:
+ leaq -8(%rdi,%rdx),%rdi
+ leaq -8(%rsi,%rdx),%rsi
+ jmp 1f
+10163216:
+ leaq -16(%rdi,%rdx),%rdi
+ leaq -16(%rsi,%rdx),%rsi
+ jmp 1f
+10163208:
+10081608:
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jmp 1f
+
+/*
+ * Mismatch was found. We have no more than 8 bytes to inspect.
+ */
+ALIGN_TEXT
1:
- testq %rdx,%rdx
- je 3f
- xorl %ecx,%ecx
-2:
- movzbl (%rdi,%rcx,1),%eax
- movzbl (%rsi,%rcx,1),%r8d
+ movzbl (%rdi),%eax
+ movzbl (%rsi),%r8d
cmpb %r8b,%al
- jne 4f
- addq $1,%rcx
- cmpq %rcx,%rdx
- jz 3f
- movzbl (%rdi,%rcx,1),%eax
- movzbl (%rsi,%rcx,1),%r8d
+ jne 2f
+
+ movzbl 1(%rdi),%eax
+ movzbl 1(%rsi),%r8d
cmpb %r8b,%al
- jne 4f
- addq $1,%rcx
- cmpq %rcx,%rdx
- jz 3f
- movzbl (%rdi,%rcx,1),%eax
- movzbl (%rsi,%rcx,1),%r8d
+ jne 2f
+
+ movzbl 2(%rdi),%eax
+ movzbl 2(%rsi),%r8d
cmpb %r8b,%al
- jne 4f
- addq $1,%rcx
- cmpq %rcx,%rdx
- jz 3f
- movzbl (%rdi,%rcx,1),%eax
- movzbl (%rsi,%rcx,1),%r8d
+ jne 2f
+
+ movzbl 3(%rdi),%eax
+ movzbl 3(%rsi),%r8d
cmpb %r8b,%al
- jne 4f
- addq $1,%rcx
- cmpq %rcx,%rdx
- jne 2b
-3:
+ jne 2f
+
+ movzbl 4(%rdi),%eax
+ movzbl 4(%rsi),%r8d
+ cmpb %r8b,%al
+ jne 2f
+
+ movzbl 5(%rdi),%eax
+ movzbl 5(%rsi),%r8d
+ cmpb %r8b,%al
+ jne 2f
+
+ movzbl 6(%rdi),%eax
+ movzbl 6(%rsi),%r8d
+ cmpb %r8b,%al
+ jne 2f
+
+ movzbl 7(%rdi),%eax
+ movzbl 7(%rsi),%r8d
+ cmpb %r8b,%al
+ jne 2f
+
xorl %eax,%eax
POP_FRAME_POINTER
ret
-4:
+2:
subl %r8d,%eax
POP_FRAME_POINTER
ret
-5:
- cmpq $32,%rdx
- jae 7f
-6:
- /*
- * 8 bytes
- */
- movq (%rdi),%r8
- movq (%rsi),%r9
- cmpq %r8,%r9
- jne 1b
- leaq 8(%rdi),%rdi
- leaq 8(%rsi),%rsi
- subq $8,%rdx
- cmpq $8,%rdx
- jae 6b
- jl 1b
- jmp 3b
-7:
- /*
- * 32 bytes
- */
- movq (%rsi),%r8
- movq 8(%rsi),%r9
- subq (%rdi),%r8
- subq 8(%rdi),%r9
- or %r8,%r9
- jnz 1b
-
- movq 16(%rsi),%r8
- movq 24(%rsi),%r9
- subq 16(%rdi),%r8
- subq 24(%rdi),%r9
- or %r8,%r9
- jnz 1b
-
- leaq 32(%rdi),%rdi
- leaq 32(%rsi),%rsi
- subq $32,%rdx
- cmpq $32,%rdx
- jae 7b
- jnz 1b
- jmp 3b
END(memcmp)
/*
More information about the svn-src-all
mailing list