git: 5048c1b85506 - main - lib/libc/amd64/string: add timingsafe_memcmp() assembly implementation

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Sun, 15 Oct 2023 19:31:20 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=5048c1b85506c5e0f441ee7dd98dd8d96d0a4a24

commit 5048c1b85506c5e0f441ee7dd98dd8d96d0a4a24
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2023-10-15 19:25:53 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2023-10-15 19:25:53 +0000

    lib/libc/amd64/string: add timingsafe_memcmp() assembly implementation
    
    Conceptually very similar to timingsafe_bcmp(), but with comparison
    logic inspired by Elijah Stone's fancy memcmp. A baseline (SSE)
    implementation was omitted this time as I was not able to get it to
    perform adequately.  Best I got was 8% over the scalar version for
    long inputs, but slower for short inputs.
    
    Sponsored by:   The FreeBSD Foundation
    Approved by:    security (cperciva)
    Inspired by:    https://github.com/moon-chilled/fancy-memcmp
    Differential Revision:  https://reviews.freebsd.org/D41696
---
 lib/libc/amd64/string/Makefile.inc        |   4 +-
 lib/libc/amd64/string/timingsafe_memcmp.S | 145 ++++++++++++++++++++++++++++++
 2 files changed, 147 insertions(+), 2 deletions(-)

diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index fc420de0450e..09bf7c8f251e 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -1,4 +1,3 @@
-
 MDSRCS+= \
 	amd64_archlevel.c \
 	bcmp.S \
@@ -16,4 +15,5 @@ MDSRCS+= \
 	strlen.S \
 	strnlen.c \
 	strspn.S \
-	timingsafe_bcmp.S
+	timingsafe_bcmp.S \
+	timingsafe_memcmp.S
diff --git a/lib/libc/amd64/string/timingsafe_memcmp.S b/lib/libc/amd64/string/timingsafe_memcmp.S
new file mode 100644
index 000000000000..3f1eccdbd640
--- /dev/null
+++ b/lib/libc/amd64/string/timingsafe_memcmp.S
@@ -0,0 +1,145 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#define ALIGN_TEXT	.p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+/* int timingsafe_memcmp(const void *rdi, const void *rsi, size_t rdx) */
+ENTRY(timingsafe_memcmp)
+	cmp	$16, %rdx		# at least 17 bytes to process?
+	ja	.Lgt16
+
+	cmp	$8, %edx		# at least 9 bytes to process?
+	ja	.L0916
+
+	cmp	$4, %edx		# at least 5 bytes to process?
+	ja	.L0508
+
+	cmp	$2, %edx		# at least 3 bytes to process?
+	ja	.L0304
+
+	test	%edx, %edx		# buffer empty?
+	jnz	.L0102
+
+	xor	%eax, %eax		# empty buffer always matches
+	ret
+
+.L0102:	movzbl	-1(%rdi, %rdx, 1), %eax	# load 1--2 bytes from first buffer
+	movzbl	-1(%rsi, %rdx, 1), %ecx
+	mov	(%rdi), %ah		# in big endian
+	mov	(%rsi), %ch
+	sub	%ecx, %eax
+	ret
+
+.L0304:	movzwl	-2(%rdi, %rdx, 1), %ecx
+	movzwl	-2(%rsi, %rdx, 1), %edx
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %esi
+	bswap	%ecx			# convert to big endian
+	bswap	%edx			# dito for edx, (e)ax, and (e)si
+	rol	$8, %ax			# ROLW is used here so the upper two
+	rol	$8, %si			# bytes stay clear, allowing us to
+	sub	%edx, %ecx		# save a SBB compared to .L0508
+	sbb	%esi, %eax
+	or	%eax, %ecx		# nonzero if not equal
+	setnz	%al
+	ret
+
+.L0508:	mov	-4(%rdi, %rdx, 1), %ecx
+	mov	-4(%rsi, %rdx, 1), %edx
+	mov	(%rdi), %edi
+	mov	(%rsi), %esi
+	bswap	%ecx			# compare in big endian
+	bswap	%edx
+	bswap	%edi
+	bswap	%esi
+	sub	%edx, %ecx
+	sbb	%esi, %edi
+	sbb	%eax, %eax		# -1 if less, 0 if greater or equal
+	or	%edi, %ecx		# nonzero if not equal
+	setnz	%al			# negative if <, 0 if =, 1 if >
+	ret
+
+.L0916:	mov	-8(%rdi, %rdx, 1), %rcx
+	mov	-8(%rsi, %rdx, 1), %rdx
+	mov	(%rdi), %rdi
+	mov	(%rsi), %rsi
+	bswap	%rcx			# compare in big endian
+	bswap	%rdx
+	bswap	%rdi
+	bswap	%rsi
+	sub	%rdx, %rcx
+	sbb	%rsi, %rdi
+	sbb	%eax, %eax		# -1 if less, 0 if greater or equal
+	or	%rdi, %rcx		# nonzero if not equal
+	setnz	%al			# negative if <, 0 if =, 1 if >
+	ret
+
+	/* compare 17+ bytes */
+.Lgt16:	mov	(%rdi), %r8		# process first 16 bytes
+	mov	(%rsi), %r9
+	mov	$32, %ecx
+	cmp	%r8, %r9		# mismatch in head?
+	cmove	8(%rdi), %r8		# if not, try second pair
+	cmove	8(%rsi), %r9
+	cmp	%rdx, %rcx
+	jae	.Ltail
+
+	/* main loop processing 16 bytes per iteration */
+	ALIGN_TEXT
+0:	mov	-16(%rdi, %rcx, 1), %r10
+	mov	-16(%rsi, %rcx, 1), %r11
+	cmp	%r10, %r11		# mismatch in first pair?
+	cmove	-8(%rdi, %rcx, 1), %r10	# if not, try second pair
+	cmove	-8(%rsi, %rcx, 1), %r11
+	cmp	%r8, %r9		# was there a mismatch previously?
+	cmove	%r10, %r8		# apply new pair if there was not
+	cmove	%r11, %r9
+	add	$16, %rcx
+	cmp	%rdx, %rcx
+	jb	0b
+
+.Ltail:	mov	-8(%rdi, %rdx, 1), %r10
+	mov	-8(%rsi, %rdx, 1), %r11
+	cmp	%r8, %r9
+	cmove	-16(%rdi, %rdx, 1), %r8
+	cmove	-16(%rsi, %rdx, 1), %r9
+	bswap	%r10			# compare in big endian
+	bswap	%r11
+	bswap	%r8
+	bswap	%r9
+	sub	%r11, %r10
+	sbb	%r9, %r8
+	sbb	%eax, %eax		# -1 if less, 0 if greater or equal
+	or	%r10, %r8		# nonzero if not equal
+	setnz	%al			# negative if <, 0 if =, 1 if >
+	ret
+END(timingsafe_memcmp)
+
+	.section .note.GNU-stack,"",%progbits