git: 4b15965daa99 - main - libc/amd64: rewrite memrchr() baseline impl. to read the string from the back

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Sat, 09 Aug 2025 20:14:10 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=4b15965daa99044daf184221b7c283bf7f2d7e66

commit 4b15965daa99044daf184221b7c283bf7f2d7e66
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2025-07-29 20:12:32 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-08-09 20:13:27 +0000

    libc/amd64: rewrite memrchr() baseline impl. to read the string from the back
    
    This ensures O(1) behaviour if the character is a constant offset
    from the end of the string, regardless of how long the string is.
    
    Reported by:    Mikael Simonsson <m@mikaelsimonsson.com>
    Reviewed by:    benni
    PR:             288321
    MFC after:      1 month
---
 lib/libc/amd64/string/memrchr.S | 152 +++++++++++++++++++---------------------
 1 file changed, 74 insertions(+), 78 deletions(-)

diff --git a/lib/libc/amd64/string/memrchr.S b/lib/libc/amd64/string/memrchr.S
index 4f6c5a238daa..f1ba48d6bb41 100644
--- a/lib/libc/amd64/string/memrchr.S
+++ b/lib/libc/amd64/string/memrchr.S
@@ -1,7 +1,7 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
- * Copyright (c) 2023 Robert Clausecker
+ * Copyright (c) 2023, 2025 Robert Clausecker <fuz@FreeBSD.org>
  */
 
 #include <machine/asm.h>
@@ -71,95 +71,91 @@ ARCHENTRY(memrchr, scalar)
 ARCHEND(memrchr, scalar)
 
 ARCHENTRY(memrchr, baseline)
-	movd		%esi, %xmm4
-	test		%rdx, %rdx		# empty buffer?
-	jz		.L0			# if yes, return immediately
+	test		%rdx, %rdx		# empty input?
+	je		.Lnomatchb
+
+
+	lea		(%rdi, %rdx, 1), %ecx	# pointer to end of buffer
+	lea		-1(%rdi, %rdx, 1), %rdx	# pointer to last char in buffer
+	movd		%esi, %xmm2
+	and		$~0x1f, %rdx		# pointer to final 32 buffer bytes
+	movdqa		(%rdx), %xmm0		# load last 32 bytes
+	movdqa		16(%rdx), %xmm1
+
+	punpcklbw	%xmm2, %xmm2		# c -> cc
 
-	punpcklbw	%xmm4, %xmm4		# c -> cc
-	mov		%edi, %ecx
-	punpcklwd	%xmm4, %xmm4		# cc -> cccc
-	and		$~0xf, %rdi		# align source pointer
-	pshufd		$0, %xmm4, %xmm4	# cccc -> cccccccccccccccc
-	and		$0xf, %ecx
-	movdqa		%xmm4, %xmm0
 	mov		$-1, %r8d
-	pcmpeqb		(%rdi), %xmm0		# compare aligned head
-	shl		%cl, %r8d		# mask of bytes in the head of the buffer
-	pmovmskb	%xmm0, %eax
+	neg		%ecx
+	mov		%r8d, %r9d
+	shr		%cl, %r8d		# mask with zeroes after the string
 
-	sub		$16, %rcx
-	and		%r8d, %eax		# match mask
-	add		%rcx, %rdx		# advance past head
-	cmc
-	jbe		.Lrunt			# did the string end in the buffer?
+	punpcklwd	%xmm2, %xmm2		# cc -> cccc
 
-	mov		%rdi, %rsi		# pointer to matching chunk
-	add		$16, %rdi
-	sub		$16, %rdx		# enough left for another round?
-	jbe		1f
+	mov		%edi, %ecx
+	mov		%r9d, %eax
+	shl		%cl, %r9d		# mask with zeroes before the string
 
-	/* main loop unrolled twice */
-	ALIGN_TEXT
-0:	movdqa		%xmm4, %xmm0
-	pcmpeqb		(%rdi), %xmm0
-	pmovmskb	%xmm0, %r8d
+	pshufd		$0, %xmm2, %xmm2	# cccc -> cccccccccccccccc
 
-	cmp		$16, %rdx		# enough left for second chunk?
-	jbe		2f
+	cmp		%rdx, %rdi		# tail is beginning of buffer?
+	cmovae		%r9d, %eax		# if yes, do combined head/tail processing
+	and		%r8d, %eax		# mask of bytes in tail part of string
 
-	movdqa		%xmm4, %xmm0
-	pcmpeqb		16(%rdi), %xmm0
+	/* process tail */
+	pcmpeqb		%xmm2, %xmm1
+	pcmpeqb		%xmm2, %xmm0
+	pmovmskb	%xmm1, %esi
 	pmovmskb	%xmm0, %ecx
+	shl		$16, %esi
+	or		%esi, %ecx		# locations of matches
+	and		%ecx, %eax		# any match inside buffer?
+	jnz		.Lprecisematchb
 
-	lea		16(%rdi), %r9
-	test		%ecx, %ecx		# match found in second chunk?
-	cmovz		%r8d, %ecx		# if not, use match data from first chunk
-	cmovz		%rdi, %r9
-
-	test		%ecx, %ecx		# any match found?
-	cmovnz		%ecx, %eax		# if yes, overwrite previously found match
-	cmovnz		%r9, %rsi
-
-	add		$32, %rdi		# advance to next iteration
-	sub		$32, %rdx		# advance to next chunks
-	ja		0b
-
-	/* process remaining 1--16 bytes */
-1:	pcmpeqb		(%rdi), %xmm4
-	mov		$0xffff, %r8d
-	xor		%ecx, %ecx
-	sub		%edx, %ecx		# number of bytes to be masked out
-	pmovmskb	%xmm4, %r9d
-	shr		%cl, %r8d		# mask of bytes to be kept in the buffer
-	and		%r9d, %r8d
-	cmovnz		%r8d, %eax
-	cmovnz		%rdi, %rsi
-	bsr		%eax, %eax
-	lea		(%rsi, %rax, 1), %rsi	# pointer to match (or junk)
-	cmovnz		%rsi, %rax		# if any match was found, return it
-	ret
+	cmp		%rdx, %rdi		# did the buffer begin here?
+	jae		.Lnomatchb		# if yes, we are done
 
-	/* end of chunk reached within first half iteration */
-2:	test		%r8d, %r8d		# match in previous chunk?
-	cmovnz		%r8d, %eax		# if yes, overwrite previous chunks
-	cmovnz		%rdi, %rsi
-	add		$16, %rdi		# point to tail
-	sub		$16, %edx
-	jmp		1b			# handle tail the same otherwise
-
-	/* runt: string ends within head, edx has negated amount of invalid head bytes */
-.Lrunt:	mov		$0xffff, %r8d
-	xor		%ecx, %ecx
-	sub		%edx, %ecx
-	shr		%cl, %r8d
-	and		%r8d, %eax
-	bsr		%eax, %eax
-	lea		(%rdi, %rax, 1), %rdi
-	cmovnz		%rdi, %rax
+	/* main loop */
+	ALIGN_TEXT
+0:	movdqa		-32(%rdx), %xmm0	# load previous string chunk
+	movdqa		-16(%rdx), %xmm1
+	sub		$32, %rdx		# beginning of string reached?
+	cmp		%rdx, %rdi
+	jae		.Ltailb
+
+	pcmpeqb		%xmm2, %xmm0
+	pcmpeqb		%xmm2, %xmm1
+	por		%xmm1, %xmm0		# match in either half?
+	pmovmskb	%xmm0, %eax
+	test		%eax, %eax
+	jz		0b
+
+.Lmatchb:
+	pcmpeqb		(%rdx), %xmm2		# redo comparison of first 16 bytes
+	pmovmskb	%xmm1, %ecx
+	pmovmskb	%xmm2, %eax
+	shl		$16, %ecx
+	or		%ecx, %eax		# location of matches
+
+.Lprecisematchb:
+	bsr		%eax, %eax		# find location of match
+	add		%rdx, %rax		# point to matching byte
 	ret
 
-	/* empty buffer: return a null pointer */
-.L0:	xor		%eax, %eax
+.Ltailb:
+	pcmpeqb		%xmm2, %xmm1
+	pcmpeqb		%xmm2, %xmm0
+	pmovmskb	%xmm1, %ecx
+	pmovmskb	%xmm0, %eax
+	shl		$16, %ecx
+	or		%ecx, %eax		# location of matches
+	and		%r9d, %eax		# mask out matches before buffer
+	bsr		%eax, %edi		# location of match
+	lea		(%rdx, %rdi, 1), %rdx	# pointer to match (if any)
+	cmovnz		%rdx, %rax		# point to match if present,
+	ret					# else null pointer
+
+.Lnomatchb:
+	xor		%eax, %eax		# return null pointer
 	ret
 ARCHEND(memrchr, baseline)