RFC: MI strlen()

Tue Jan 13 06:12:13 PST 2009

On Wed, 14 Jan 2009, I wrote:

> Also, it takes something like -fno-builtin-strlen in CFLAGS for the
> libc strlen() to ever be used.  On both amd64 and i386, gcc's builtin
> strlen() uses an inline "rep scasb", which can be inferior to even the

Actually, this is no longer true.  gcc-4.2 on at least amd64 has the
weird behaviour of disabling its "optimized" builtin strlen() with -O2
but not with -O1.  Even spelling strlen as __builtin_strlen doesn't
give the builtin.

>> Unpatched libc:
>>     1400.97 real      4159.34 user      2901.08 sys
>>     1396.73 real      4159.06 user      2906.16 sys
>>     1380.27 real      4158.20 user      2803.22 sys
>> 
>> Patched libc:
>>     1363.29 real      4154.89 user      2749.94 sys
>>     1373.96 real      4150.45 user      2830.46 sys
>>     1368.62 real      4152.48 user      2838.52 sys
>> 
>> (with 'make cleanworld' between builds)
>
> I don't believe this improvement.  First, it is far too large to have been
> caused by changing libc strlen().  Second, libc strlen() is not normally
> used.

Since -O2 is a standard "optimization" for makeworld, libc strlen() is now
normally used, at least on amd64 (i386 with certain -mcpu should be the
same, but I guess it isn't).

> My buildworld times on a 1x2core machine with nfs-mounted src/ and ffs
> obj/ src are much smaller:
>
>      824.96 real      1294.35 user       187.09 sys
>
> but this is for FreeBSD-~5.2.  FreeBSD and gcc have a combined bloat factor

Of course I don't use -O2.

> Here are some old files for too-simple strlen() benchmarks:
> ...
> I forget the results (don't have an amd64 machine handy for re-testing).

I reran the test.  I had to change -O2 to -O to test builtin strlen,
and made some other changes in a failed attempt to force __builtin_strlen
with -O2).  builtin strlen is much slower than I remembered.  The
NetBSD asm strlen (it uses the 0x8080 trick with 64-bit words) is
about 4.5 faster for long strings but slower for short strings (ones
shorter than the word size).  I expect your C version is similar.

%%%
string length 0:
algorithm -DSTRLEN=__builtin_strlen:
         0.35 real         0.35 user         0.00 sys
algorithm -fno-builtin:
         0.06 real         0.05 user         0.00 sys
algorithm -fno-builtin zq.S:
         0.09 real         0.09 user         0.00 sys
string length 1:
algorithm -DSTRLEN=__builtin_strlen:
         0.39 real         0.38 user         0.00 sys
algorithm -fno-builtin:
         0.07 real         0.06 user         0.00 sys
algorithm -fno-builtin zq.S:
         0.13 real         0.12 user         0.00 sys
string length 10:
algorithm -DSTRLEN=__builtin_strlen:
         0.68 real         0.66 user         0.01 sys
algorithm -fno-builtin:
         0.21 real         0.21 user         0.00 sys
algorithm -fno-builtin zq.S:
         0.12 real         0.10 user         0.00 sys
string length 100:
algorithm -DSTRLEN=__builtin_strlen:
         3.60 real         3.60 user         0.00 sys
algorithm -fno-builtin:
         1.02 real         1.01 user         0.00 sys
algorithm -fno-builtin zq.S:
         0.26 real         0.26 user         0.00 sys
string length 1000:
algorithm -DSTRLEN=__builtin_strlen:
        32.79 real        32.78 user         0.00 sys
algorithm -fno-builtin:
         7.25 real         7.25 user         0.00 sys
algorithm -fno-builtin zq.S:
         1.55 real         1.54 user         0.00 sys
%%%

%%%
#! /bin/sh
# This is a shell archive.  Remove anything before this line, then unpack
# it by saving it into a file and typing "sh file".  To overwrite existing
# files, type "sh file -c".  You can also feed this as standard input via
# unshar, or by typing "sh <file", e.g..  If this archive is complete, you
# will see the following message at the end:
#		"End of shell archive."
# Contents:  z.c zi zq.S
# Wrapped by root at besplex.bde.org on Wed Jan 14 01:05:11 2009
PATH=/bin:/usr/bin:/usr/ucb ; export PATH
if test -f 'z.c' -a "${1}" != "-c" ; then
   echo shar: Will not clobber existing file \"'z.c'\"
else
echo shar: Extracting \"'z.c'\" \(463 characters\)
sed "s/^X//" >'z.c' <<'END_OF_FILE'
X#include <stdio.h>
X#include <stdlib.h>
X#include <string.h>
X
X#ifndef LEN
X#define	LEN	5
X#endif
X
X#ifndef STRLEN
X#define	STRLEN	strlen
X#endif
X
Xstatic char *foo[1000];
Xstatic volatile size_t s;
X
X#ifdef MISTRLEN
X#include "/usr/src/lib/libc/string/strlen.c"
X#endif
X
Xint
Xmain(void)
X{
X	int i;
X
X	for (i = 0; i < 10; i++) {
X		foo[i] = malloc(LEN + 1);
X		sprintf(foo[i], "%*.*s", LEN, LEN, "foo");
X	}
X	for (i = 0; i < 10000000; i++)
X		s = STRLEN(foo[i % 10]);
X	return (0);
X}
END_OF_FILE
if test 463 -ne `wc -c <'z.c'`; then
     echo shar: \"'z.c'\" unpacked with wrong size!
fi
# end of 'z.c'
fi
if test -f 'zi' -a "${1}" != "-c" ; then
   echo shar: Will not clobber existing file \"'zi'\"
else
echo shar: Extracting \"'zi'\" \(224 characters\)
sed "s/^X//" >'zi' <<'END_OF_FILE'
Xfor len in 0 1 10 100 1000
Xdo
X	echo "string length $len:"
X	for alg in -DSTRLEN=__builtin_strlen -fno-builtin "-fno-builtin zq.S"
X	do
X		echo "algorithm $alg:"
X		cc -o z z.c -O $alg -g -static -DLEN=$len
X		time ./z
X	done
Xdone
END_OF_FILE
if test 224 -ne `wc -c <'zi'`; then
     echo shar: \"'zi'\" unpacked with wrong size!
fi
# end of 'zi'
fi
if test -f 'zq.S' -a "${1}" != "-c" ; then
   echo shar: Will not clobber existing file \"'zq.S'\"
else
echo shar: Extracting \"'zq.S'\" \(4304 characters\)
sed "s/^X//" >'zq.S' <<'END_OF_FILE'
X/*
X * Written by J.T. Conklin <jtc at acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X__FBSDID("$FreeBSD$");
X
X#if 0
X	RCSID("$NetBSD: strlen.S,v 1.3 2004/07/19 20:04:41 drochner Exp $")
X#endif
X
XENTRY(strlen)
X	movq	%rdi,%rax
X	negq	%rdi
X
X.Lalign:
X	/* Consider unrolling loop? */
X	testb	$7,%al
X	je	.Lword_aligned
X	cmpb	$0,(%rax)
X	jne	1f
X	leaq	(%rdi,%rax),%rax
X	ret
X1:	incq	%rax
X	jmp	.Lalign
X
X	/*
X	 * There are many well known branch-free sequences which are used
X	 * for determining whether a zero-byte is contained within a word.
X	 * These sequences are generally much more efficent than loading
X	 * and comparing each byte individually.
X	 *
X	 * The expression [1,2]:
X	 *
X	 * (1)  ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f))
X	 *
X	 * evaluates to a non-zero value if any of the bytes in the
X	 * original word is zero.
X	 *
X	 * It also has the useful property that bytes in the result word
X	 * that coorespond to non-zero bytes in the original word have
X	 * the value 0x00, while bytes cooresponding to zero bytes have
X	 * the value 0x80. This allows calculation of the first (and
X	 * last) occurance of a zero byte within the word (useful for C's
X	 * str* primitives) by counting the number of leading (or
X	 * trailing) zeros and dividing the result by 8.  On machines
X	 * without (or with slow) clz() / ctz() instructions, testing
X	 * each byte in the result word for zero is necessary.
X	 *
X	 * This typically takes 4 instructions (5 on machines without
X	 * "not-or") not including those needed to load the constant.
X	 *
X	 *
X	 * The expression:
X	 *
X	 * (2)  ((x - 0x01....01) & ~x & 0x80....80)
X	 *
X	 * evaluates to a non-zero value if any of the bytes in the
X	 * original word is zero.
X	 *
X	 * On little endian machines, the first byte in the result word
X	 * that cooresponds to a zero byte in the original byte is 0x80,
X	 * so clz() can be used as above.  On big endian machines, and
X	 * little endian machines without (or with a slow) clz() insn,
X	 * testing each byte in the original for zero is necessary
X	 *
X	 * This typically takes 3 instructions (4 on machines without
X	 * "and with complement") not including those needed to load
X	 * constants.
X	 *
X	 *
X	 * The expression:
X	 *
X	 * (3)  ((x - 0x01....01) & 0x80....80)
X	 *
X	 * always evaluates to a non-zero value if any of the bytes in
X	 * the original word is zero.  However, in rare cases, it also
X	 * evaluates to a non-zero value when none of the bytes in the
X	 * original word is zero.
X	 *
X	 * To account for possible false positives, each byte of the
X	 * original word must be checked when the expression evaluates to
X	 * a non-zero value.  However, because it is simpler than those
X	 * presented above, code that uses it will be faster as long as
X	 * the rate of false positives is low.
X	 *
X	 * This is likely, because the the false positive can only occur
X	 * if the most siginificant bit of a byte within the word is set.
X	 * The expression will never fail for typical 7-bit ASCII strings.
X	 *
X	 * This typically takes 2 instructions not including those needed
X	 * to load constants.
X	 *
X	 *
X	 * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003
X	 *
X	 * [2] International Business Machines, "The PowerPC Compiler Writer's
X	 *     Guide", Warthman Associates, 1996
X	 */
X
X	.align 4
X.Lword_aligned:
X	movabsq	$0x0101010101010101,%r8
X	movabsq	$0x8080808080808080,%r9
X.Lloop:
X	movq	(%rax),%rdx
X	addq	$8,%rax
X	subq	%r8,%rdx
X	testq	%r9,%rdx
X	je	.Lloop
X
X	/*
X	 * In rare cases, the above loop may exit prematurely. We must
X	 * return to the loop if none of the bytes in the word equal 0.
X	 */
X	cmpb	$0,-8(%rax)		/* 1st byte == 0? */
X	je	.Lsub8
X	cmpb	$0,-7(%rax)		/* 2nd byte == 0? */
X	je	.Lsub7
X	cmpb	$0,-6(%rax)		/* 3rd byte == 0? */
X	je	.Lsub6
X	cmpb	$0,-5(%rax)		/* 4th byte == 0? */
X	je	.Lsub5
X	cmpb	$0,-4(%rax)		/* 5th byte == 0? */
X	je	.Lsub4
X	cmpb	$0,-3(%rax)		/* 6th byte == 0? */
X	je	.Lsub3
X	cmpb	$0,-2(%rax)		/* 7th byte == 0? */
X	je	.Lsub2
X	cmpb	$0,-1(%rax)		/* 8th byte == 0? */
X	jne	.Lloop
X
X.Lsub1:
X	leaq	-1(%rdi,%rax),%rax
X	ret
X.Lsub2:
X	leaq	-2(%rdi,%rax),%rax
X	ret
X.Lsub3:
X	leaq	-3(%rdi,%rax),%rax
X	ret
X.Lsub4:
X	leaq	-4(%rdi,%rax),%rax
X	ret
X.Lsub5:
X	leaq	-5(%rdi,%rax),%rax
X	ret
X.Lsub6:
X	leaq	-6(%rdi,%rax),%rax
X	ret
X.Lsub7:
X	leaq	-7(%rdi,%rax),%rax
X	ret
X.Lsub8:
X	leaq	-8(%rdi,%rax),%rax
X	ret
END_OF_FILE
if test 4304 -ne `wc -c <'zq.S'`; then
     echo shar: \"'zq.S'\" unpacked with wrong size!
fi
# end of 'zq.S'
fi
echo shar: End of shell archive.
exit 0
%%%

Bruce