RFC: MI strlen()
Bruce Evans
brde at optusnet.com.au
Tue Jan 13 06:12:13 PST 2009
On Wed, 14 Jan 2009, I wrote:
> Also, it takes something like -fno-builtin-strlen in CFLAGS for the
> libc strlen() to ever be used. On both amd64 and i386, gcc's builtin
> strlen() uses an inline "rep scasb", which can be inferior to even the
Actually, this is no longer true. gcc-4.2 on at least amd64 has the
weird behaviour of disabling its "optimized" builtin strlen() with -O2
but not with -O1. Even spelling strlen as __builtin_strlen doesn't
give the builtin.
>> Unpatched libc:
>> 1400.97 real 4159.34 user 2901.08 sys
>> 1396.73 real 4159.06 user 2906.16 sys
>> 1380.27 real 4158.20 user 2803.22 sys
>>
>> Patched libc:
>> 1363.29 real 4154.89 user 2749.94 sys
>> 1373.96 real 4150.45 user 2830.46 sys
>> 1368.62 real 4152.48 user 2838.52 sys
>>
>> (with 'make cleanworld' between builds)
>
> I don't believe this improvement. First, it is far too large to have been
> caused by changing libc strlen(). Second, libc strlen() is not normally
> used.
Since -O2 is a standard "optimization" for makeworld, libc strlen() is now
normally used, at least on amd64 (i386 with certain -mcpu should be the
same, but I guess it isn't).
> My buildworld times on a 1x2core machine with nfs-mounted src/ and ffs
> obj/ src are much smaller:
>
> 824.96 real 1294.35 user 187.09 sys
>
> but this is for FreeBSD-~5.2. FreeBSD and gcc have a combined bloat factor
Of course I don't use -O2.
> Here are some old files for too-simple strlen() benchmarks:
> ...
> I forget the results (don't have an amd64 machine handy for re-testing).
I reran the test. I had to change -O2 to -O to test builtin strlen,
and made some other changes in a failed attempt to force __builtin_strlen
with -O2). builtin strlen is much slower than I remembered. The
NetBSD asm strlen (it uses the 0x8080 trick with 64-bit words) is
about 4.5 faster for long strings but slower for short strings (ones
shorter than the word size). I expect your C version is similar.
%%%
string length 0:
algorithm -DSTRLEN=__builtin_strlen:
0.35 real 0.35 user 0.00 sys
algorithm -fno-builtin:
0.06 real 0.05 user 0.00 sys
algorithm -fno-builtin zq.S:
0.09 real 0.09 user 0.00 sys
string length 1:
algorithm -DSTRLEN=__builtin_strlen:
0.39 real 0.38 user 0.00 sys
algorithm -fno-builtin:
0.07 real 0.06 user 0.00 sys
algorithm -fno-builtin zq.S:
0.13 real 0.12 user 0.00 sys
string length 10:
algorithm -DSTRLEN=__builtin_strlen:
0.68 real 0.66 user 0.01 sys
algorithm -fno-builtin:
0.21 real 0.21 user 0.00 sys
algorithm -fno-builtin zq.S:
0.12 real 0.10 user 0.00 sys
string length 100:
algorithm -DSTRLEN=__builtin_strlen:
3.60 real 3.60 user 0.00 sys
algorithm -fno-builtin:
1.02 real 1.01 user 0.00 sys
algorithm -fno-builtin zq.S:
0.26 real 0.26 user 0.00 sys
string length 1000:
algorithm -DSTRLEN=__builtin_strlen:
32.79 real 32.78 user 0.00 sys
algorithm -fno-builtin:
7.25 real 7.25 user 0.00 sys
algorithm -fno-builtin zq.S:
1.55 real 1.54 user 0.00 sys
%%%
%%%
#! /bin/sh
# This is a shell archive. Remove anything before this line, then unpack
# it by saving it into a file and typing "sh file". To overwrite existing
# files, type "sh file -c". You can also feed this as standard input via
# unshar, or by typing "sh <file", e.g.. If this archive is complete, you
# will see the following message at the end:
# "End of shell archive."
# Contents: z.c zi zq.S
# Wrapped by root at besplex.bde.org on Wed Jan 14 01:05:11 2009
PATH=/bin:/usr/bin:/usr/ucb ; export PATH
if test -f 'z.c' -a "${1}" != "-c" ; then
echo shar: Will not clobber existing file \"'z.c'\"
else
echo shar: Extracting \"'z.c'\" \(463 characters\)
sed "s/^X//" >'z.c' <<'END_OF_FILE'
X#include <stdio.h>
X#include <stdlib.h>
X#include <string.h>
X
X#ifndef LEN
X#define LEN 5
X#endif
X
X#ifndef STRLEN
X#define STRLEN strlen
X#endif
X
Xstatic char *foo[1000];
Xstatic volatile size_t s;
X
X#ifdef MISTRLEN
X#include "/usr/src/lib/libc/string/strlen.c"
X#endif
X
Xint
Xmain(void)
X{
X int i;
X
X for (i = 0; i < 10; i++) {
X foo[i] = malloc(LEN + 1);
X sprintf(foo[i], "%*.*s", LEN, LEN, "foo");
X }
X for (i = 0; i < 10000000; i++)
X s = STRLEN(foo[i % 10]);
X return (0);
X}
END_OF_FILE
if test 463 -ne `wc -c <'z.c'`; then
echo shar: \"'z.c'\" unpacked with wrong size!
fi
# end of 'z.c'
fi
if test -f 'zi' -a "${1}" != "-c" ; then
echo shar: Will not clobber existing file \"'zi'\"
else
echo shar: Extracting \"'zi'\" \(224 characters\)
sed "s/^X//" >'zi' <<'END_OF_FILE'
Xfor len in 0 1 10 100 1000
Xdo
X echo "string length $len:"
X for alg in -DSTRLEN=__builtin_strlen -fno-builtin "-fno-builtin zq.S"
X do
X echo "algorithm $alg:"
X cc -o z z.c -O $alg -g -static -DLEN=$len
X time ./z
X done
Xdone
END_OF_FILE
if test 224 -ne `wc -c <'zi'`; then
echo shar: \"'zi'\" unpacked with wrong size!
fi
# end of 'zi'
fi
if test -f 'zq.S' -a "${1}" != "-c" ; then
echo shar: Will not clobber existing file \"'zq.S'\"
else
echo shar: Extracting \"'zq.S'\" \(4304 characters\)
sed "s/^X//" >'zq.S' <<'END_OF_FILE'
X/*
X * Written by J.T. Conklin <jtc at acorntoolworks.com>
X * Public domain.
X */
X
X#include <machine/asm.h>
X__FBSDID("$FreeBSD$");
X
X#if 0
X RCSID("$NetBSD: strlen.S,v 1.3 2004/07/19 20:04:41 drochner Exp $")
X#endif
X
XENTRY(strlen)
X movq %rdi,%rax
X negq %rdi
X
X.Lalign:
X /* Consider unrolling loop? */
X testb $7,%al
X je .Lword_aligned
X cmpb $0,(%rax)
X jne 1f
X leaq (%rdi,%rax),%rax
X ret
X1: incq %rax
X jmp .Lalign
X
X /*
X * There are many well known branch-free sequences which are used
X * for determining whether a zero-byte is contained within a word.
X * These sequences are generally much more efficent than loading
X * and comparing each byte individually.
X *
X * The expression [1,2]:
X *
X * (1) ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f))
X *
X * evaluates to a non-zero value if any of the bytes in the
X * original word is zero.
X *
X * It also has the useful property that bytes in the result word
X * that coorespond to non-zero bytes in the original word have
X * the value 0x00, while bytes cooresponding to zero bytes have
X * the value 0x80. This allows calculation of the first (and
X * last) occurance of a zero byte within the word (useful for C's
X * str* primitives) by counting the number of leading (or
X * trailing) zeros and dividing the result by 8. On machines
X * without (or with slow) clz() / ctz() instructions, testing
X * each byte in the result word for zero is necessary.
X *
X * This typically takes 4 instructions (5 on machines without
X * "not-or") not including those needed to load the constant.
X *
X *
X * The expression:
X *
X * (2) ((x - 0x01....01) & ~x & 0x80....80)
X *
X * evaluates to a non-zero value if any of the bytes in the
X * original word is zero.
X *
X * On little endian machines, the first byte in the result word
X * that cooresponds to a zero byte in the original byte is 0x80,
X * so clz() can be used as above. On big endian machines, and
X * little endian machines without (or with a slow) clz() insn,
X * testing each byte in the original for zero is necessary
X *
X * This typically takes 3 instructions (4 on machines without
X * "and with complement") not including those needed to load
X * constants.
X *
X *
X * The expression:
X *
X * (3) ((x - 0x01....01) & 0x80....80)
X *
X * always evaluates to a non-zero value if any of the bytes in
X * the original word is zero. However, in rare cases, it also
X * evaluates to a non-zero value when none of the bytes in the
X * original word is zero.
X *
X * To account for possible false positives, each byte of the
X * original word must be checked when the expression evaluates to
X * a non-zero value. However, because it is simpler than those
X * presented above, code that uses it will be faster as long as
X * the rate of false positives is low.
X *
X * This is likely, because the the false positive can only occur
X * if the most siginificant bit of a byte within the word is set.
X * The expression will never fail for typical 7-bit ASCII strings.
X *
X * This typically takes 2 instructions not including those needed
X * to load constants.
X *
X *
X * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003
X *
X * [2] International Business Machines, "The PowerPC Compiler Writer's
X * Guide", Warthman Associates, 1996
X */
X
X .align 4
X.Lword_aligned:
X movabsq $0x0101010101010101,%r8
X movabsq $0x8080808080808080,%r9
X.Lloop:
X movq (%rax),%rdx
X addq $8,%rax
X subq %r8,%rdx
X testq %r9,%rdx
X je .Lloop
X
X /*
X * In rare cases, the above loop may exit prematurely. We must
X * return to the loop if none of the bytes in the word equal 0.
X */
X cmpb $0,-8(%rax) /* 1st byte == 0? */
X je .Lsub8
X cmpb $0,-7(%rax) /* 2nd byte == 0? */
X je .Lsub7
X cmpb $0,-6(%rax) /* 3rd byte == 0? */
X je .Lsub6
X cmpb $0,-5(%rax) /* 4th byte == 0? */
X je .Lsub5
X cmpb $0,-4(%rax) /* 5th byte == 0? */
X je .Lsub4
X cmpb $0,-3(%rax) /* 6th byte == 0? */
X je .Lsub3
X cmpb $0,-2(%rax) /* 7th byte == 0? */
X je .Lsub2
X cmpb $0,-1(%rax) /* 8th byte == 0? */
X jne .Lloop
X
X.Lsub1:
X leaq -1(%rdi,%rax),%rax
X ret
X.Lsub2:
X leaq -2(%rdi,%rax),%rax
X ret
X.Lsub3:
X leaq -3(%rdi,%rax),%rax
X ret
X.Lsub4:
X leaq -4(%rdi,%rax),%rax
X ret
X.Lsub5:
X leaq -5(%rdi,%rax),%rax
X ret
X.Lsub6:
X leaq -6(%rdi,%rax),%rax
X ret
X.Lsub7:
X leaq -7(%rdi,%rax),%rax
X ret
X.Lsub8:
X leaq -8(%rdi,%rax),%rax
X ret
END_OF_FILE
if test 4304 -ne `wc -c <'zq.S'`; then
echo shar: \"'zq.S'\" unpacked with wrong size!
fi
# end of 'zq.S'
fi
echo shar: End of shell archive.
exit 0
%%%
Bruce
More information about the freebsd-arch
mailing list