git: 8a680912a190 - stable/13 - Revert "Reimplement strlen"

Wed Feb 3 19:39:57 UTC 2021

The branch stable/13 has been updated by mjg:

URL: https://cgit.FreeBSD.org/src/commit/?id=8a680912a190d619be6194f88917052eed327f7b

commit 8a680912a190d619be6194f88917052eed327f7b
Author:     Mateusz Guzik <mjg at FreeBSD.org>
AuthorDate: 2021-02-03 19:38:10 +0000
Commit:     Mateusz Guzik <mjg at FreeBSD.org>
CommitDate: 2021-02-03 19:39:49 +0000

    Revert "Reimplement strlen"
    
    This reverts commit 710e45c4b8539d028877769f1a4ec088c48fb5f1.
    
    It breaks for some corner cases on big endian ppc64.
    Given the stage of the release process it is best to revert for now.
    
    Reported by:    jhibbits
    
    (cherry picked from commit 33f0540b13d949c7cc226a79927ddc2062ff98bf)
---
 lib/libc/string/strlen.c | 82 +++++++++++++++++++++++++++++++-----------------
 sys/libkern/strlen.c     | 79 +++++++++++++++++++++++++++++++---------------
 2 files changed, 108 insertions(+), 53 deletions(-)

diff --git a/lib/libc/string/strlen.c b/lib/libc/string/strlen.c
index 0bdc81d7bb9a..a862ffc245ca 100644
--- a/lib/libc/string/strlen.c
+++ b/lib/libc/string/strlen.c
@@ -35,6 +35,10 @@ __FBSDID("$FreeBSD$");
 /*
  * Portable strlen() for 32-bit and 64-bit systems.
  *
+ * Rationale: it is generally much more efficient to do word length
+ * operations and avoid branches on modern computer systems, as
+ * compared to byte-length operations with a lot of branches.
+ *
  * The expression:
  *
  *	((x - 0x01....01) & ~x & 0x80....80)
@@ -42,13 +46,18 @@ __FBSDID("$FreeBSD$");
  * would evaluate to a non-zero value iff any of the bytes in the
  * original word is zero.
  *
+ * On multi-issue processors, we can divide the above expression into:
+ *	a)  (x - 0x01....01)
+ *	b) (~x & 0x80....80)
+ *	c) a & b
+ *
+ * Where, a) and b) can be partially computed in parallel.
+ *
  * The algorithm above is found on "Hacker's Delight" by
  * Henry S. Warren, Jr.
- *
- * Note: this leaves performance on the table and each architecture
- * would be best served with a tailor made routine instead.
  */
 
+/* Magic numbers for the algorithm */
 #if LONG_BIT == 32
 static const unsigned long mask01 = 0x01010101;
 static const unsigned long mask80 = 0x80808080;
@@ -61,45 +70,62 @@ static const unsigned long mask80 = 0x8080808080808080;
 
 #define	LONGPTR_MASK (sizeof(long) - 1)
 
-#if BYTE_ORDER == LITTLE_ENDIAN
-#define	FINDZERO __builtin_ctzl
-#else
-#define	FINDZERO __builtin_clzl
-#endif
+/*
+ * Helper macro to return string length if we caught the zero
+ * byte.
+ */
+#define testbyte(x)				\
+	do {					\
+		if (p[x] == '\0')		\
+		    return (p - str + x);	\
+	} while (0)
 
 size_t
 strlen(const char *str)
 {
+	const char *p;
 	const unsigned long *lp;
-	unsigned long mask;
 	long va, vb;
-	long val;
 
-	lp = (unsigned long *) (uintptr_t) str;
-	if ((uintptr_t)lp & LONGPTR_MASK) {
-		lp = (__typeof(lp)) ((uintptr_t)lp & ~LONGPTR_MASK);
-#if BYTE_ORDER == LITTLE_ENDIAN
-		mask = ~(~0UL << (((uintptr_t)str & LONGPTR_MASK) << 3));
-#else
-		mask = ~(~0UL >> (((uintptr_t)str & LONGPTR_MASK) << 3));
-#endif
-		val = *lp | mask;
-		va = (val - mask01);
-		vb = ((~val) & mask80);
-		if (va & vb) {
-			return ((const char *)lp - str + (FINDZERO(va & vb) >> 3));
-		}
-		lp++;
-	}
+	/*
+	 * Before trying the hard (unaligned byte-by-byte access) way
+	 * to figure out whether there is a nul character, try to see
+	 * if there is a nul character is within this accessible word
+	 * first.
+	 *
+	 * p and (p & ~LONGPTR_MASK) must be equally accessible since
+	 * they always fall in the same memory page, as long as page
+	 * boundaries is integral multiple of word size.
+	 */
+	lp = (const unsigned long *)((uintptr_t)str & ~LONGPTR_MASK);
+	va = (*lp - mask01);
+	vb = ((~*lp) & mask80);
+	lp++;
+	if (va & vb)
+		/* Check if we have \0 in the first part */
+		for (p = str; p < (const char *)lp; p++)
+			if (*p == '\0')
+				return (p - str);
 
+	/* Scan the rest of the string using word sized operation */
 	for (; ; lp++) {
 		va = (*lp - mask01);
 		vb = ((~*lp) & mask80);
 		if (va & vb) {
-			return ((const char *)lp - str + (FINDZERO(va & vb) >> 3));
+			p = (const char *)(lp);
+			testbyte(0);
+			testbyte(1);
+			testbyte(2);
+			testbyte(3);
+#if (LONG_BIT >= 64)
+			testbyte(4);
+			testbyte(5);
+			testbyte(6);
+			testbyte(7);
+#endif
 		}
 	}
 
-	__builtin_unreachable();
+	/* NOTREACHED */
 	return (0);
 }
diff --git a/sys/libkern/strlen.c b/sys/libkern/strlen.c
index 8fa5f3927ea9..a8c7964f69a3 100644
--- a/sys/libkern/strlen.c
+++ b/sys/libkern/strlen.c
@@ -34,6 +34,10 @@ __FBSDID("$FreeBSD$");
 /*
  * Portable strlen() for 32-bit and 64-bit systems.
  *
+ * Rationale: it is generally much more efficient to do word length
+ * operations and avoid branches on modern computer systems, as
+ * compared to byte-length operations with a lot of branches.
+ *
  * The expression:
  *
  *	((x - 0x01....01) & ~x & 0x80....80)
@@ -41,10 +45,18 @@ __FBSDID("$FreeBSD$");
  * would evaluate to a non-zero value iff any of the bytes in the
  * original word is zero.
  *
+ * On multi-issue processors, we can divide the above expression into:
+ *	a)  (x - 0x01....01)
+ *	b) (~x & 0x80....80)
+ *	c) a & b
+ *
+ * Where, a) and b) can be partially computed in parallel.
+ *
  * The algorithm above is found on "Hacker's Delight" by
  * Henry S. Warren, Jr.
  */
 
+/* Magic numbers for the algorithm */
 #if LONG_BIT == 32
 static const unsigned long mask01 = 0x01010101;
 static const unsigned long mask80 = 0x80808080;
@@ -57,45 +69,62 @@ static const unsigned long mask80 = 0x8080808080808080;
 
 #define	LONGPTR_MASK (sizeof(long) - 1)
 
-#if BYTE_ORDER == LITTLE_ENDIAN
-#define	FINDZERO __builtin_ctzl
-#else
-#define	FINDZERO __builtin_clzl
-#endif
+/*
+ * Helper macro to return string length if we caught the zero
+ * byte.
+ */
+#define testbyte(x)				\
+	do {					\
+		if (p[x] == '\0')		\
+		    return (p - str + x);	\
+	} while (0)
 
 size_t
 (strlen)(const char *str)
 {
+	const char *p;
 	const unsigned long *lp;
-	unsigned long mask;
 	long va, vb;
-	long val;
 
-	lp = (unsigned long *) (uintptr_t) str;
-	if ((uintptr_t)lp & LONGPTR_MASK) {
-		lp = (__typeof(lp)) ((uintptr_t)lp & ~LONGPTR_MASK);
-#if BYTE_ORDER == LITTLE_ENDIAN
-		mask = ~(~0UL << (((uintptr_t)str & LONGPTR_MASK) << 3));
-#else
-		mask = ~(~0UL >> (((uintptr_t)str & LONGPTR_MASK) << 3));
-#endif
-		val = *lp | mask;
-		va = (val - mask01);
-		vb = ((~val) & mask80);
-		if (va & vb) {
-			return ((const char *)lp - str + (FINDZERO(va & vb) >> 3));
-		}
-		lp++;
-	}
+	/*
+	 * Before trying the hard (unaligned byte-by-byte access) way
+	 * to figure out whether there is a nul character, try to see
+	 * if there is a nul character is within this accessible word
+	 * first.
+	 *
+	 * p and (p & ~LONGPTR_MASK) must be equally accessible since
+	 * they always fall in the same memory page, as long as page
+	 * boundaries is integral multiple of word size.
+	 */
+	lp = (const unsigned long *)((uintptr_t)str & ~LONGPTR_MASK);
+	va = (*lp - mask01);
+	vb = ((~*lp) & mask80);
+	lp++;
+	if (va & vb)
+		/* Check if we have \0 in the first part */
+		for (p = str; p < (const char *)lp; p++)
+			if (*p == '\0')
+				return (p - str);
 
+	/* Scan the rest of the string using word sized operation */
 	for (; ; lp++) {
 		va = (*lp - mask01);
 		vb = ((~*lp) & mask80);
 		if (va & vb) {
-			return ((const char *)lp - str + (FINDZERO(va & vb) >> 3));
+			p = (const char *)(lp);
+			testbyte(0);
+			testbyte(1);
+			testbyte(2);
+			testbyte(3);
+#if (LONG_BIT >= 64)
+			testbyte(4);
+			testbyte(5);
+			testbyte(6);
+			testbyte(7);
+#endif
 		}
 	}
 
-	__builtin_unreachable();
+	/* NOTREACHED */
 	return (0);
 }