svn commit: r339205 - head/sys/amd64/amd64
Mateusz Guzik
mjg at FreeBSD.org
Fri Oct 5 19:25:10 UTC 2018
Author: mjg
Date: Fri Oct 5 19:25:09 2018
New Revision: 339205
URL: https://svnweb.freebsd.org/changeset/base/339205
Log:
amd64: make memset less slow with mov
rep stos has a high startup time even on modern microarchitectures like
Skylake. Intel optimization manuals discuss how for small sizes it is
beneficial to go for streaming stores. Since those cannot be used without
extra penalty in the kernel I investigated performance impact of just
regular movs.
The patch below implements a very simple scheme: a 32-byte loop followed
by filling in the remainder of at most 31 bytes. It has a breaking
point at 256 bytes, above which it falls back to rep stos. It provides a significant win
over the current primitive on several machines I tested (both Intel and
AMD). A 64-byte loop did not provide any benefit even for multiple of 64
sizes.
See the review for benchmark data.
Reviewed by: kib
Approved by: re (gjb)
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D17398
Modified:
head/sys/amd64/amd64/support.S
Modified: head/sys/amd64/amd64/support.S
==============================================================================
--- head/sys/amd64/amd64/support.S Fri Oct 5 18:15:44 2018 (r339204)
+++ head/sys/amd64/amd64/support.S Fri Oct 5 19:25:09 2018 (r339205)
@@ -320,43 +320,92 @@ END(memcpy_erms)
* memset(dst, c, len)
* rdi, rsi, rdx
*/
-ENTRY(memset_std)
+.macro MEMSET erms
PUSH_FRAME_POINTER
movq %rdi,%r9
movq %rdx,%rcx
movzbq %sil,%r8
movabs $0x0101010101010101,%rax
imulq %r8,%rax
- cmpq $15,%rcx
- jbe 1f
- shrq $3,%rcx
- rep
- stosq
- movq %rdx,%rcx
- andq $7,%rcx
- jne 1f
+
+ cmpq $32,%rcx
+ jb 1016f
+
+ cmpq $256,%rcx
+ ja 1256f
+
+1032:
+ movq %rax,(%rdi)
+ movq %rax,8(%rdi)
+ movq %rax,16(%rdi)
+ movq %rax,24(%rdi)
+ leaq 32(%rdi),%rdi
+ subq $32,%rcx
+ cmpq $32,%rcx
+ jae 1032b
+ cmpb $0,%cl
+ je 1000f
+1016:
+ cmpb $16,%cl
+ jl 1008f
+ movq %rax,(%rdi)
+ movq %rax,8(%rdi)
+ subb $16,%cl
+ jz 1000f
+ leaq 16(%rdi),%rdi
+1008:
+ cmpb $8,%cl
+ jl 1004f
+ movq %rax,(%rdi)
+ subb $8,%cl
+ jz 1000f
+ leaq 8(%rdi),%rdi
+1004:
+ cmpb $4,%cl
+ jl 1002f
+ movl %eax,(%rdi)
+ subb $4,%cl
+ jz 1000f
+ leaq 4(%rdi),%rdi
+1002:
+ cmpb $2,%cl
+ jl 1001f
+ movw %ax,(%rdi)
+ subb $2,%cl
+ jz 1000f
+ leaq 2(%rdi),%rdi
+1001:
+ cmpb $1,%cl
+ jl 1000f
+ movb %al,(%rdi)
+1000:
movq %r9,%rax
POP_FRAME_POINTER
ret
ALIGN_TEXT
-1:
+1256:
+.if \erms == 1
rep
stosb
+.else
+ shrq $3,%rcx
+ rep
+ stosq
+ movq %rdx,%rcx
+ andb $7,%cl
+ jne 1004b
+.endif
movq %r9,%rax
POP_FRAME_POINTER
ret
+.endm
+
+ENTRY(memset_std)
+ MEMSET erms=0
END(memset_std)
ENTRY(memset_erms)
- PUSH_FRAME_POINTER
- movq %rdi,%r9
- movq %rdx,%rcx
- movb %sil,%al
- rep
- stosb
- movq %r9,%rax
- POP_FRAME_POINTER
- ret
+ MEMSET erms=1
END(memset_erms)
/* fillw(pat, base, cnt) */
More information about the svn-src-all
mailing list