svn commit: r340472 - in head: lib/libc/amd64/string sys/amd64/amd64
Mateusz Guzik
mjg at FreeBSD.org
Fri Nov 16 00:44:24 UTC 2018
Author: mjg
Date: Fri Nov 16 00:44:22 2018
New Revision: 340472
URL: https://svnweb.freebsd.org/changeset/base/340472
Log:
amd64: handle small memset buffers with overlapping stores
Instead of jumping to locations which store the exact number of bytes,
use displacement to move the destination.
In particular, the following clears an area of between 8 and 16 bytes (inclusive)
branch-free:
movq %r10,(%rdi)
movq %r10,-8(%rdi,%rcx)
For instance for rcx of 10 the second line is rdi + 10 - 8 = rdi + 2.
Writing 8 bytes starting at that offset overlaps with the 6 bytes written
previously and writes 2 new bytes, giving 10 in total.
Provides a nice win for smaller stores. Results for other sizes are erratic,
depending on the microarchitecture.
General idea taken from NetBSD (restricted use of the trick) and bionic
string functions (use for various ranges like in this patch).
Reviewed by: kib (previous version)
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D17660
Modified:
head/lib/libc/amd64/string/memset.S
head/sys/amd64/amd64/support.S
Modified: head/lib/libc/amd64/string/memset.S
==============================================================================
--- head/lib/libc/amd64/string/memset.S Fri Nov 16 00:03:31 2018 (r340471)
+++ head/lib/libc/amd64/string/memset.S Fri Nov 16 00:44:22 2018 (r340472)
@@ -41,12 +41,12 @@ __FBSDID("$FreeBSD$");
imulq %r8,%r10
cmpq $32,%rcx
- jb 1016f
+ jbe 101632f
cmpq $256,%rcx
ja 1256f
-1032:
+103200:
movq %r10,(%rdi)
movq %r10,8(%rdi)
movq %r10,16(%rdi)
@@ -54,43 +54,49 @@ __FBSDID("$FreeBSD$");
leaq 32(%rdi),%rdi
subq $32,%rcx
cmpq $32,%rcx
- jae 1032b
- cmpb $0,%cl
- je 1000f
-1016:
+ ja 103200b
cmpb $16,%cl
- jl 1008f
+ ja 201632f
+ movq %r10,-16(%rdi,%rcx)
+ movq %r10,-8(%rdi,%rcx)
+ ret
+ ALIGN_TEXT
+101632:
+ cmpb $16,%cl
+ jl 100816f
+201632:
movq %r10,(%rdi)
movq %r10,8(%rdi)
- subb $16,%cl
- jz 1000f
- leaq 16(%rdi),%rdi
-1008:
+ movq %r10,-16(%rdi,%rcx)
+ movq %r10,-8(%rdi,%rcx)
+ ret
+ ALIGN_TEXT
+100816:
cmpb $8,%cl
- jl 1004f
+ jl 100408f
movq %r10,(%rdi)
- subb $8,%cl
- jz 1000f
- leaq 8(%rdi),%rdi
-1004:
+ movq %r10,-8(%rdi,%rcx)
+ ret
+ ALIGN_TEXT
+100408:
cmpb $4,%cl
- jl 1002f
+ jl 100204f
movl %r10d,(%rdi)
- subb $4,%cl
- jz 1000f
- leaq 4(%rdi),%rdi
-1002:
+ movl %r10d,-4(%rdi,%rcx)
+ ret
+ ALIGN_TEXT
+100204:
cmpb $2,%cl
- jl 1001f
+ jl 100001f
movw %r10w,(%rdi)
- subb $2,%cl
- jz 1000f
- leaq 2(%rdi),%rdi
-1001:
- cmpb $1,%cl
- jl 1000f
+ movw %r10w,-2(%rdi,%rcx)
+ ret
+ ALIGN_TEXT
+100001:
+ cmpb $0,%cl
+ je 100000f
movb %r10b,(%rdi)
-1000:
+100000:
ret
ALIGN_TEXT
1256:
@@ -127,6 +133,7 @@ __FBSDID("$FreeBSD$");
leaq 16(%rdi,%r8),%rdi
jmp 1b
.endm
+
ENTRY(memset)
MEMSET erms=0
Modified: head/sys/amd64/amd64/support.S
==============================================================================
--- head/sys/amd64/amd64/support.S Fri Nov 16 00:03:31 2018 (r340471)
+++ head/sys/amd64/amd64/support.S Fri Nov 16 00:44:22 2018 (r340472)
@@ -459,12 +459,12 @@ END(memcpy_erms)
imulq %r8,%r10
cmpq $32,%rcx
- jb 1016f
+ jbe 101632f
cmpq $256,%rcx
ja 1256f
-1032:
+103200:
movq %r10,(%rdi)
movq %r10,8(%rdi)
movq %r10,16(%rdi)
@@ -472,43 +472,54 @@ END(memcpy_erms)
leaq 32(%rdi),%rdi
subq $32,%rcx
cmpq $32,%rcx
- jae 1032b
- cmpb $0,%cl
- je 1000f
-1016:
+ ja 103200b
cmpb $16,%cl
- jl 1008f
+ ja 201632f
+ movq %r10,-16(%rdi,%rcx)
+ movq %r10,-8(%rdi,%rcx)
+ POP_FRAME_POINTER
+ ret
+ ALIGN_TEXT
+101632:
+ cmpb $16,%cl
+ jl 100816f
+201632:
movq %r10,(%rdi)
movq %r10,8(%rdi)
- subb $16,%cl
- jz 1000f
- leaq 16(%rdi),%rdi
-1008:
+ movq %r10,-16(%rdi,%rcx)
+ movq %r10,-8(%rdi,%rcx)
+ POP_FRAME_POINTER
+ ret
+ ALIGN_TEXT
+100816:
cmpb $8,%cl
- jl 1004f
+ jl 100408f
movq %r10,(%rdi)
- subb $8,%cl
- jz 1000f
- leaq 8(%rdi),%rdi
-1004:
+ movq %r10,-8(%rdi,%rcx)
+ POP_FRAME_POINTER
+ ret
+ ALIGN_TEXT
+100408:
cmpb $4,%cl
- jl 1002f
+ jl 100204f
movl %r10d,(%rdi)
- subb $4,%cl
- jz 1000f
- leaq 4(%rdi),%rdi
-1002:
+ movl %r10d,-4(%rdi,%rcx)
+ POP_FRAME_POINTER
+ ret
+ ALIGN_TEXT
+100204:
cmpb $2,%cl
- jl 1001f
+ jl 100001f
movw %r10w,(%rdi)
- subb $2,%cl
- jz 1000f
- leaq 2(%rdi),%rdi
-1001:
- cmpb $1,%cl
- jl 1000f
+ movw %r10w,-2(%rdi,%rcx)
+ POP_FRAME_POINTER
+ ret
+ ALIGN_TEXT
+100001:
+ cmpb $0,%cl
+ je 100000f
movb %r10b,(%rdi)
-1000:
+100000:
POP_FRAME_POINTER
ret
ALIGN_TEXT
More information about the svn-src-all
mailing list