svn commit: r340472 - in head: lib/libc/amd64/string sys/amd64/amd64
Mateusz Guzik
mjg at FreeBSD.org
Fri Nov 16 00:44:24 UTC 2018
Author: mjg
Date: Fri Nov 16 00:44:22 2018
New Revision: 340472
URL: https://svnweb.freebsd.org/changeset/base/340472
Log:
amd64: handle small memset buffers with overlapping stores
Instead of jumping to locations which store the exact number of bytes,
use displacement to move the destination.
In particular, the following clears an area of between 8 and 16 bytes (inclusive)
branch-free:
movq %r10,(%rdi)
movq %r10,-8(%rdi,%rcx)
For instance for rcx of 10 the second line is rdi + 10 - 8 = rdi + 2.
Writing 8 bytes starting at that offset overlaps with the 6 bytes written
previously and writes 2 new bytes, giving 10 in total.
Provides a nice win for smaller stores. Results for other sizes are erratic,
depending on the microarchitecture.
General idea taken from NetBSD (restricted use of the trick) and bionic
string functions (use for various ranges like in this patch).
Reviewed by: kib (previous version)
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D17660
Modified:
head/lib/libc/amd64/string/memset.S
head/sys/amd64/amd64/support.S
Modified: head/lib/libc/amd64/string/memset.S
==============================================================================
--- head/lib/libc/amd64/string/memset.S Fri Nov 16 00:03:31 2018 (r340471)
+++ head/lib/libc/amd64/string/memset.S Fri Nov 16 00:44:22 2018 (r340472)
@@ -41,12 +41,12 @@ __FBSDID("$FreeBSD$");
imulq %r8,%r10
cmpq $32,%rcx
- jb 1016f
+ jbe 101632f
cmpq $256,%rcx
ja 1256f
-1032:
+103200:
movq %r10,(%rdi)
movq %r10,8(%rdi)
movq %r10,16(%rdi)
@@ -54,43 +54,49 @@ __FBSDID("$FreeBSD$");
leaq 32(%rdi),%rdi
subq $32,%rcx
cmpq $32,%rcx
- jae 1032b
- cmpb $0,%cl
- je 1000f
-1016:
+ ja 103200b
cmpb $16,%cl
- jl 1008f
+ ja 201632f
+ movq %r10,-16(%rdi,%rcx)
+ movq %r10,-8(%rdi,%rcx)
+ ret
+ ALIGN_TEXT
+101632:
+ cmpb $16,%cl
+ jl 100816f
+201632:
movq %r10,(%rdi)
movq %r10,8(%rdi)
- subb $16,%cl
- jz 1000f
- leaq 16(%rdi),%rdi
-1008:
+ movq %r10,-16(%rdi,%rcx)
+ movq %r10,-8(%rdi,%rcx)
+ ret
+ ALIGN_TEXT
+100816:
cmpb $8,%cl
- jl 1004f
+ jl 100408f
movq %r10,(%rdi)
- subb $8,%cl
- jz 1000f
- leaq 8(%rdi),%rdi
-1004:
+ movq %r10,-8(%rdi,%rcx)
+ ret
+ ALIGN_TEXT
+100408:
cmpb $4,%cl
- jl 1002f
+ jl 100204f
movl %r10d,(%rdi)
- subb $4,%cl
- jz 1000f
- leaq 4(%rdi),%rdi
-1002:
+ movl %r10d,-4(%rdi,%rcx)
+ ret
+ ALIGN_TEXT
+100204:
cmpb $2,%cl
- jl 1001f
+ jl 100001f
movw %r10w,(%rdi)
- subb $2,%cl
- jz 1000f
- leaq 2(%rdi),%rdi
-1001:
- cmpb $1,%cl
- jl 1000f
+ movw %r10w,-2(%rdi,%rcx)
+ ret
+ ALIGN_TEXT
+100001:
+ cmpb $0,%cl
+ je 100000f
movb %r10b,(%rdi)
-1000:
+100000:
ret
ALIGN_TEXT
1256:
@@ -127,6 +133,7 @@ __FBSDID("$FreeBSD$");
leaq 16(%rdi,%r8),%rdi
jmp 1b
.endm
+
ENTRY(memset)
MEMSET erms=0
Modified: head/sys/amd64/amd64/support.S
==============================================================================
--- head/sys/amd64/amd64/support.S Fri Nov 16 00:03:31 2018 (r340471)
+++ head/sys/amd64/amd64/support.S Fri Nov 16 00:44:22 2018 (r340472)
@@ -459,12 +459,12 @@ END(memcpy_erms)
imulq %r8,%r10
cmpq $32,%rcx
- jb 1016f
+ jbe 101632f
cmpq $256,%rcx
ja 1256f
-1032:
+103200:
movq %r10,(%rdi)
movq %r10,8(%rdi)
movq %r10,16(%rdi)
@@ -472,43 +472,54 @@ END(memcpy_erms)
leaq 32(%rdi),%rdi
subq $32,%rcx
cmpq $32,%rcx
- jae 1032b
- cmpb $0,%cl
- je 1000f
-1016:
+ ja 103200b
cmpb $16,%cl
- jl 1008f
+ ja 201632f
+ movq %r10,-16(%rdi,%rcx)
+ movq %r10,-8(%rdi,%rcx)
+ POP_FRAME_POINTER
+ ret
+ ALIGN_TEXT
+101632:
+ cmpb $16,%cl
+ jl 100816f
+201632:
movq %r10,(%rdi)
movq %r10,8(%rdi)
- subb $16,%cl
- jz 1000f
- leaq 16(%rdi),%rdi
-1008:
+ movq %r10,-16(%rdi,%rcx)
+ movq %r10,-8(%rdi,%rcx)
+ POP_FRAME_POINTER
+ ret
+ ALIGN_TEXT
+100816:
cmpb $8,%cl
- jl 1004f
+ jl 100408f
movq %r10,(%rdi)
- subb $8,%cl
- jz 1000f
- leaq 8(%rdi),%rdi
-1004:
+ movq %r10,-8(%rdi,%rcx)
+ POP_FRAME_POINTER
+ ret
+ ALIGN_TEXT
+100408:
cmpb $4,%cl
- jl 1002f
+ jl 100204f
movl %r10d,(%rdi)
- subb $4,%cl
- jz 1000f
- leaq 4(%rdi),%rdi
-1002:
+ movl %r10d,-4(%rdi,%rcx)
+ POP_FRAME_POINTER
+ ret
+ ALIGN_TEXT
+100204:
cmpb $2,%cl
- jl 1001f
+ jl 100001f
movw %r10w,(%rdi)
- subb $2,%cl
- jz 1000f
- leaq 2(%rdi),%rdi
-1001:
- cmpb $1,%cl
- jl 1000f
+ movw %r10w,-2(%rdi,%rcx)
+ POP_FRAME_POINTER
+ ret
+ ALIGN_TEXT
+100001:
+ cmpb $0,%cl
+ je 100000f
movb %r10b,(%rdi)
-1000:
+100000:
POP_FRAME_POINTER
ret
ALIGN_TEXT
More information about the svn-src-all
mailing list