bin/96393: assembler implementations for libz on i386
Mikhail Teterin
mi+kde at aldan.algebra.com
Thu Apr 27 04:50:17 UTC 2006
>Number: 96393
>Category: bin
>Synopsis: assembler implementations for libz on i386
>Confidential: no
>Severity: serious
>Priority: medium
>Responsible: freebsd-bugs
>State: open
>Quarter:
>Keywords:
>Date-Required:
>Class: change-request
>Submitter-Id: current-users
>Arrival-Date: Thu Apr 27 04:50:11 GMT 2006
>Closed-Date:
>Last-Modified:
>Originator: Mikhail Teterin
>Release: FreeBSD 6.1-PRERELEASE i386
>Organization:
Virtual Estates, Inc.
>Environment:
>Description:
libz comes with assembler implementations of "hot" functions for
some platforms. x86 is the easiest to port, although someone
absolutely must visit the amd64 assembler code written in MASM.
The speed gains are quite noticeable at both compression and
decompression.
>How-To-Repeat:
>Fix:
Below is the patch for Makefile, plus the shar of the three new files taken
from the zlib-1.2.3 tarball (with minor modification). The patch is intended
for true pentiums right now, although Opterons and Athlons in 32-bit mode may
also benefit.
Index: Makefile
===================================================================
RCS file: /meow/ncvs/src/lib/libz/Makefile,v
retrieving revision 1.17
diff -U2 -r1.17 Makefile
--- Makefile 3 Jun 2005 15:25:13 -0000 1.17
+++ Makefile 27 Apr 2006 01:11:47 -0000
@@ -17,7 +17,29 @@
SRCS = adler32.c compress.c crc32.c gzio.c uncompr.c deflate.c trees.c \
- zutil.c inflate.c inftrees.c inffast.c zopen.c infback.c
+ zutil.c inflate.c inftrees.c zopen.c infback.c
INCS= zconf.h zlib.h
+.if ${MACHINE_ARCH} == "i386" && !empty(CPUTYPE:Mpentium*) # CPUTYPE may be unset
+SRCS+= match.S
+CFLAGS+=-DASMV -DNO_UNDERLINE
+
+.if ${CPUTYPE} == "pentium" || ${CPUTYPE} == "pentium-mmx" # CPUTYPE is known to be set here
+.PATH: ${.CURDIR}/contrib/asm586
+.else
+.PATH: ${.CURDIR}/contrib/asm686
+.endif
+.endif
+
+.if ${MACHINE_ARCH} == "i386"
+SRCS+= _inffast.S # bug in bsd.*.mk prevents calling this inffast.S
+CFLAGS+=-DASMINF
+.PATH: ${.CURDIR}/contrib/inflate86
+. if defined(CPUTYPE) && ${CPUTYPE} == "pentium-mmx" # The mmx code is only beneficial here
+CFLAGS+=-DUSE_MMX
+. endif
+.else
+SRCS+= inffast.c
+.endif
+
minigzip: all minigzip.o
$(CC) -o minigzip minigzip.o -L. -lz
# This is a shell archive. Save it in a file, remove anything before
# this line, and then unpack it by entering "sh file". Note, it may
# create directories; files and directories will be owned by you and
# have default permissions.
#
# This archive contains:
#
# contrib/asm586/match.S
# contrib/asm686/match.S
# contrib/inflate86/_inffast.S
#
echo x - contrib/asm586/match.S
sed 's/^X//' >contrib/asm586/match.S << 'END-of-contrib/asm586/match.S'
X/* match.s -- Pentium-optimized version of longest_match()
X * Written for zlib 1.1.2
X * Copyright (C) 1998 Brian Raiter <breadbox at muppetlabs.com>
X *
X * This is free software; you can redistribute it and/or modify it
X * under the terms of the GNU General Public License.
X */
X
X#ifndef NO_UNDERLINE
X#define match_init _match_init
X#define longest_match _longest_match
X#endif
X
X#define MAX_MATCH (258)
X#define MIN_MATCH (3)
X#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
X#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
X
X/* stack frame offsets */
X
X#define wmask 0 /* local copy of s->wmask
*/
X#define window 4 /* local copy of s->window
*/
X#define windowbestlen 8 /* s->window + bestlen
*/
X#define chainlenscanend 12 /* high word: current chain
len */
X /* low word: last bytes sought */
X#define scanstart 16 /* first two bytes of string
*/
X#define scanalign 20 /* dword-misalignment of
string */
X#define nicematch 24 /* a good enough match size
*/
X#define bestlen 28 /* size of best match so far
*/
X#define scan 32 /* ptr to string wanting match
*/
X
X#define LocalVarsSize (36)
X/* saved ebx 36 */
X/* saved edi 40 */
X/* saved esi 44 */
X/* saved ebp 48 */
X/* return address 52 */
X#define deflatestate 56 /* the function arguments
*/
X#define curmatch 60
X
X/* Offsets for fields in the deflate_state structure. These numbers
X * are calculated from the definition of deflate_state, with the
X * assumption that the compiler will dword-align the fields. (Thus,
X * changing the definition of deflate_state could easily cause this
X * program to crash horribly, without so much as a warning at
X * compile time. Sigh.)
X */
X
X/* All the +zlib1222add offsets are due to the addition of fields
X * in zlib in the deflate_state structure since the asm code was first
X * written:
X * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
X * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
X * (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
X */
X
X#define zlib1222add (0)
X
X#define dsWSize (36+zlib1222add)
X#define dsWMask (44+zlib1222add)
X#define dsWindow (48+zlib1222add)
X#define dsPrev (56+zlib1222add)
X#define dsMatchLen (88+zlib1222add)
X#define dsPrevMatch (92+zlib1222add)
X#define dsStrStart (100+zlib1222add)
X#define dsMatchStart (104+zlib1222add)
X#define dsLookahead (108+zlib1222add)
X#define dsPrevLen (112+zlib1222add)
X#define dsMaxChainLen (116+zlib1222add)
X#define dsGoodMatch (132+zlib1222add)
X#define dsNiceMatch (136+zlib1222add)
X
X
X.file "match.S"
X
X.globl match_init, longest_match
X
X.text
X
X/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
X
Xlongest_match:
X
X/* Save registers that the compiler may be using, and adjust %esp to */
X/* make room for our stack frame. */
X
X pushl %ebp
X pushl %edi
X pushl %esi
X pushl %ebx
X subl $LocalVarsSize, %esp
X
X/* Retrieve the function arguments. %ecx will hold cur_match */
X/* throughout the entire function. %edx will hold the pointer to the */
X/* deflate_state structure during the function's setup (before */
X/* entering the main loop). */
X
X movl deflatestate(%esp), %edx
X movl curmatch(%esp), %ecx
X
X/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
X
X movl dsNiceMatch(%edx), %eax
X movl dsLookahead(%edx), %ebx
X cmpl %eax, %ebx
X jl LookaheadLess /* lookahead < nice_match: keep lookahead in %ebx */
X movl %eax, %ebx
XLookaheadLess: movl %ebx, nicematch(%esp)
X
X/* register Bytef *scan = s->window + s->strstart; */
X
X movl dsWindow(%edx), %esi
X movl %esi, window(%esp)
X movl dsStrStart(%edx), %ebp
X lea (%esi,%ebp), %edi
X movl %edi, scan(%esp)
X
X/* Determine how many bytes the scan ptr is off from being */
X/* dword-aligned, i.e. scanalign = (-scan) & 3. */
X
X movl %edi, %eax
X negl %eax
X andl $3, %eax
X movl %eax, scanalign(%esp)
X
X/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
X/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
X
X movl dsWSize(%edx), %eax
X subl $MIN_LOOKAHEAD, %eax
X subl %eax, %ebp
X jg LimitPositive
X xorl %ebp, %ebp
XLimitPositive:
X
X/* unsigned chain_length = s->max_chain_length; */
X/* if (s->prev_length >= s->good_match) { */
X/* chain_length >>= 2; */
X/* } */
X
X movl dsPrevLen(%edx), %eax
X movl dsGoodMatch(%edx), %ebx
X cmpl %ebx, %eax
X movl dsMaxChainLen(%edx), %ebx
X jl LastMatchGood /* prev_length < good_match: skip the shift */
X shrl $2, %ebx
XLastMatchGood:
X
X/* chainlen is decremented once beforehand so that the function can */
X/* use the sign flag instead of the zero flag for the exit test. */
X/* It is then shifted into the high word, to make room for the */
X/* scanend value, which it will always accompany. */
X
X decl %ebx
X shll $16, %ebx
X
X/* int best_len = s->prev_length; */
X
X movl dsPrevLen(%edx), %eax
X movl %eax, bestlen(%esp)
X
X/* Store s->window + best_len in %esi and in the windowbestlen slot. */
X
X addl %eax, %esi
X movl %esi, windowbestlen(%esp)
X
X/* register ush scan_start = *(ushf*)scan; */
X/* register ush scan_end = *(ushf*)(scan+best_len-1); */
X
X movw (%edi), %bx
X movw %bx, scanstart(%esp)
X movw -1(%edi,%eax), %bx
X movl %ebx, chainlenscanend(%esp)
X
X/* Posf *prev = s->prev; */
X/* uInt wmask = s->w_mask; */
X
X movl dsPrev(%edx), %edi
X movl dsWMask(%edx), %edx
X mov %edx, wmask(%esp)
X
X/* Jump into the main loop. */
X
X jmp LoopEntry
X
X.balign 16
X
X/* do {
X * match = s->window + cur_match;
X * if (*(ushf*)(match+best_len-1) != scan_end ||
X * *(ushf*)match != scan_start) continue;
X * [...]
X * } while ((cur_match = prev[cur_match & wmask]) > limit
X * && --chain_length != 0);
X *
X * Here is the inner loop of the function. The function will spend the
X * majority of its time in this loop, and majority of that time will
X * be spent in the first ten instructions.
X *
X * Within this loop:
X * %ebx = chainlenscanend - i.e., ((chainlen << 16) | scanend)
X * %ecx = curmatch
X * %edx = curmatch & wmask
X * %esi = windowbestlen - i.e., (window + bestlen)
X * %edi = prev
X * %ebp = limit
X *
X * Two optimization notes on the choice of instructions:
X *
X * The first instruction uses a 16-bit address, which costs an extra,
X * unpairable cycle. This is cheaper than doing a 32-bit access and
X * zeroing the high word, due to the 3-cycle misalignment penalty which
X * would occur half the time. This also turns out to be cheaper than
X * doing two separate 8-bit accesses, as the memory is so rarely in the
X * L1 cache.
X *
X * The window buffer, however, apparently spends a lot of time in the
X * cache, and so it is faster to retrieve the word at the end of the
X * match string with two 8-bit loads. The instructions that test the
X * word at the beginning of the match string, however, are executed
X * much less frequently, and there it was cheaper to use 16-bit
X * instructions, which avoided the necessity of saving off and
X * subsequently reloading one of the other registers.
X */
XLookupLoop:
X /* 1 U & V */
X movw (%edi,%edx,2), %cx /* 2 U pipe */
X movl wmask(%esp), %edx /* 2 V pipe */
X cmpl %ebp, %ecx /* 3 U pipe */
X jbe LeaveNow /* 3 V pipe */
X subl $0x00010000, %ebx /* 4 U pipe */
X js LeaveNow /* 4 V pipe */
XLoopEntry: movb -1(%esi,%ecx), %al /* 5 U pipe */
X andl %ecx, %edx /* 5 V pipe */
X cmpb %bl, %al /* 6 U pipe */
X jnz LookupLoop /* 6 V pipe */
X movb (%esi,%ecx), %ah
X cmpb %bh, %ah
X jnz LookupLoop
X movl window(%esp), %eax
X movw (%eax,%ecx), %ax
X cmpw scanstart(%esp), %ax
X jnz LookupLoop
X
X/* Store the current value of chainlen. */
X
X movl %ebx, chainlenscanend(%esp)
X
X/* Point %edi to the string under scrutiny, and %esi to the string we */
X/* are hoping to match it up with. In actuality, %esi and %edi are */
X/* both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and %edx is */
X/* initialized to -MAX_MATCH_8, so comparing starts scanalign bytes in. */
X
X movl window(%esp), %esi
X movl scan(%esp), %edi
X addl %ecx, %esi
X movl scanalign(%esp), %eax
X movl $(-MAX_MATCH_8), %edx
X lea MAX_MATCH_8(%edi,%eax), %edi
X lea MAX_MATCH_8(%esi,%eax), %esi
X
X/* Test the strings for equality, 8 bytes at a time. At the end,
X * adjust %edx so that it is offset to the exact byte that mismatched.
X *
X * We already know at this point that the first three bytes of the
X * strings match each other, and they can be safely passed over before
X * starting the compare loop. So what this code does is skip over 0-3
X * bytes, as much as necessary in order to dword-align the %edi
X * pointer. (%esi will still be misaligned three times out of four.)
X *
X * It should be confessed that this loop usually does not represent
X * much of the total running time. Replacing it with a more
X * straightforward "rep cmpsb" would not drastically degrade
X * performance.
X */
XLoopCmps:
X movl (%esi,%edx), %eax
X movl (%edi,%edx), %ebx
X xorl %ebx, %eax /* nonzero bits mark differing bytes */
X jnz LeaveLoopCmps
X movl 4(%esi,%edx), %eax
X movl 4(%edi,%edx), %ebx
X xorl %ebx, %eax
X jnz LeaveLoopCmps4
X addl $8, %edx
X jnz LoopCmps
X jmp LenMaximum
XLeaveLoopCmps4: addl $4, %edx
XLeaveLoopCmps: testl $0x0000FFFF, %eax /* difference within the two low bytes? */
X jnz LenLower
X addl $2, %edx /* no: skip them and examine the upper half */
X shrl $16, %eax
XLenLower: subb $1, %al /* carry is set only if %al was zero (byte equal) */
X adcl $0, %edx /* ... in which case the next byte is the mismatch */
X
X/* Calculate the length of the match. If it is at least MAX_MATCH, */
X/* then automatically accept it as the best possible match and leave. */
X
X lea (%edi,%edx), %eax
X movl scan(%esp), %edi
X subl %edi, %eax
X cmpl $MAX_MATCH, %eax
X jge LenMaximum
X
X/* If the length of the match is not longer than the best match we */
X/* have so far, then forget it and return to the lookup loop. */
X
X movl deflatestate(%esp), %edx
X movl bestlen(%esp), %ebx
X cmpl %ebx, %eax
X jg LongerMatch
X movl chainlenscanend(%esp), %ebx /* reload the loop registers */
X movl windowbestlen(%esp), %esi
X movl dsPrev(%edx), %edi
X movl wmask(%esp), %edx
X andl %ecx, %edx
X jmp LookupLoop
X
X/* s->match_start = cur_match; */
X/* best_len = len; */
X/* if (len >= nice_match) break; */
X/* scan_end = *(ushf*)(scan+best_len-1); */
X
XLongerMatch: movl nicematch(%esp), %ebx
X movl %eax, bestlen(%esp)
X movl %ecx, dsMatchStart(%edx)
X cmpl %ebx, %eax
X jge LeaveNow
X movl window(%esp), %esi
X addl %eax, %esi
X movl %esi, windowbestlen(%esp)
X movl chainlenscanend(%esp), %ebx
X movw -1(%edi,%eax), %bx /* %edi still holds scan at this point */
X movl dsPrev(%edx), %edi
X movl %ebx, chainlenscanend(%esp)
X movl wmask(%esp), %edx
X andl %ecx, %edx
X jmp LookupLoop
X
X/* Accept the current string, with the maximum possible length. */
X
XLenMaximum: movl deflatestate(%esp), %edx
X movl $MAX_MATCH, bestlen(%esp)
X movl %ecx, dsMatchStart(%edx)
X
X/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
X/* return s->lookahead; */
X
XLeaveNow:
X movl deflatestate(%esp), %edx
X movl bestlen(%esp), %ebx
X movl dsLookahead(%edx), %eax
X cmpl %eax, %ebx
X jg LookaheadRet /* best_len > lookahead: return lookahead */
X movl %ebx, %eax
XLookaheadRet:
X
X/* Restore the stack and return from whence we came. */
X
X addl $LocalVarsSize, %esp
X popl %ebx
X popl %esi
X popl %edi
X popl %ebp
Xmatch_init: ret /* match_init is just this ret, shared with longest_match */
END-of-contrib/asm586/match.S
echo x - contrib/asm686/match.S
sed 's/^X//' >contrib/asm686/match.S << 'END-of-contrib/asm686/match.S'
X/* match.s -- Pentium-Pro-optimized version of longest_match()
X * Written for zlib 1.1.2
X * Copyright (C) 1998 Brian Raiter <breadbox at muppetlabs.com>
X *
X * This is free software; you can redistribute it and/or modify it
X * under the terms of the GNU General Public License.
X */
X
X#ifndef NO_UNDERLINE
X#define match_init _match_init
X#define longest_match _longest_match
X#endif
X
X#define MAX_MATCH (258)
X#define MIN_MATCH (3)
X#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
X#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
X
X/* stack frame offsets */
X
X#define chainlenwmask 0 /* high word: current chain
len */
X /* low word: s->wmask */
X#define window 4 /* local copy of s->window
*/
X#define windowbestlen 8 /* s->window + bestlen
*/
X#define scanstart 16 /* first two bytes of string
*/
X#define scanend 12 /* last two bytes of string
*/
X#define scanalign 20 /* dword-misalignment of
string */
X#define nicematch 24 /* a good enough match size
*/
X#define bestlen 28 /* size of best match so far
*/
X#define scan 32 /* ptr to string wanting match
*/
X
X#define LocalVarsSize (36)
X/* saved ebx 36 */
X/* saved edi 40 */
X/* saved esi 44 */
X/* saved ebp 48 */
X/* return address 52 */
X#define deflatestate 56 /* the function arguments
*/
X#define curmatch 60
X
X/* All the +zlib1222add offsets are due to the addition of fields
X * in zlib in the deflate_state structure since the asm code was first
X * written:
X * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
X * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
X * (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
X */
X
X#define zlib1222add (0)
X
X#define dsWSize (36+zlib1222add)
X#define dsWMask (44+zlib1222add)
X#define dsWindow (48+zlib1222add)
X#define dsPrev (56+zlib1222add)
X#define dsMatchLen (88+zlib1222add)
X#define dsPrevMatch (92+zlib1222add)
X#define dsStrStart (100+zlib1222add)
X#define dsMatchStart (104+zlib1222add)
X#define dsLookahead (108+zlib1222add)
X#define dsPrevLen (112+zlib1222add)
X#define dsMaxChainLen (116+zlib1222add)
X#define dsGoodMatch (132+zlib1222add)
X#define dsNiceMatch (136+zlib1222add)
X
X
X.file "match.S"
X
X.globl match_init, longest_match
X
X.text
X
X/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
X
Xlongest_match:
X
X/* Save registers that the compiler may be using, and adjust %esp to */
X/* make room for our stack frame. */
X
X pushl %ebp
X pushl %edi
X pushl %esi
X pushl %ebx
X subl $LocalVarsSize, %esp
X
X/* Retrieve the function arguments. %ecx will hold cur_match */
X/* throughout the entire function. %edx will hold the pointer to the */
X/* deflate_state structure during the function's setup (before */
X/* entering the main loop). */
X
X movl deflatestate(%esp), %edx
X movl curmatch(%esp), %ecx
X
X/* uInt wmask = s->w_mask; */
X/* unsigned chain_length = s->max_chain_length; */
X/* if (s->prev_length >= s->good_match) { */
X/* chain_length >>= 2; */
X/* } */
X
X movl dsPrevLen(%edx), %eax
X movl dsGoodMatch(%edx), %ebx
X cmpl %ebx, %eax
X movl dsWMask(%edx), %eax
X movl dsMaxChainLen(%edx), %ebx
X jl LastMatchGood /* prev_length < good_match: skip the shift */
X shrl $2, %ebx
XLastMatchGood:
X
X/* chainlen is decremented once beforehand so that the function can */
X/* use the sign flag instead of the zero flag for the exit test. */
X/* It is then shifted into the high word, to make room for the wmask */
X/* value, which it will always accompany. */
X
X decl %ebx
X shll $16, %ebx
X orl %eax, %ebx
X movl %ebx, chainlenwmask(%esp)
X
X/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
X
X movl dsNiceMatch(%edx), %eax
X movl dsLookahead(%edx), %ebx
X cmpl %eax, %ebx
X jl LookaheadLess /* lookahead < nice_match: keep lookahead in %ebx */
X movl %eax, %ebx
XLookaheadLess: movl %ebx, nicematch(%esp)
X
X/* register Bytef *scan = s->window + s->strstart; */
X
X movl dsWindow(%edx), %esi
X movl %esi, window(%esp)
X movl dsStrStart(%edx), %ebp
X lea (%esi,%ebp), %edi
X movl %edi, scan(%esp)
X
X/* Determine how many bytes the scan ptr is off from being */
X/* dword-aligned, i.e. scanalign = (-scan) & 3. */
X
X movl %edi, %eax
X negl %eax
X andl $3, %eax
X movl %eax, scanalign(%esp)
X
X/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
X/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
X
X movl dsWSize(%edx), %eax
X subl $MIN_LOOKAHEAD, %eax
X subl %eax, %ebp
X jg LimitPositive
X xorl %ebp, %ebp
XLimitPositive:
X
X/* int best_len = s->prev_length; */
X
X movl dsPrevLen(%edx), %eax
X movl %eax, bestlen(%esp)
X
X/* Store s->window + best_len in %esi and in the windowbestlen slot. */
X
X addl %eax, %esi
X movl %esi, windowbestlen(%esp)
X
X/* register ush scan_start = *(ushf*)scan; */
X/* register ush scan_end = *(ushf*)(scan+best_len-1); */
X/* Posf *prev = s->prev; */
X
X movzwl (%edi), %ebx
X movl %ebx, scanstart(%esp)
X movzwl -1(%edi,%eax), %ebx
X movl %ebx, scanend(%esp)
X movl dsPrev(%edx), %edi
X
X/* Jump into the main loop. */
X
X movl chainlenwmask(%esp), %edx
X jmp LoopEntry
X
X.balign 16
X
X/* do {
X * match = s->window + cur_match;
X * if (*(ushf*)(match+best_len-1) != scan_end ||
X * *(ushf*)match != scan_start) continue;
X * [...]
X * } while ((cur_match = prev[cur_match & wmask]) > limit
X * && --chain_length != 0);
X *
X * Here is the inner loop of the function. The function will spend the
X * majority of its time in this loop, and majority of that time will
X * be spent in the first ten instructions.
X *
X * Within this loop:
X * %ebx = scanend
X * %ecx = curmatch
X * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
X * %esi = windowbestlen - i.e., (window + bestlen)
X * %edi = prev
X * %ebp = limit
X */
XLookupLoop:
X andl %edx, %ecx
X movzwl (%edi,%ecx,2), %ecx
X cmpl %ebp, %ecx
X jbe LeaveNow /* cur_match <= limit: stop searching */
X subl $0x00010000, %edx /* decrement chainlen in the high word */
X js LeaveNow
XLoopEntry: movzwl -1(%esi,%ecx), %eax
X cmpl %ebx, %eax
X jnz LookupLoop
X movl window(%esp), %eax
X movzwl (%eax,%ecx), %eax
X cmpl scanstart(%esp), %eax
X jnz LookupLoop
X
X/* Store the current value of chainlen. */
X
X movl %edx, chainlenwmask(%esp)
X
X/* Point %edi to the string under scrutiny, and %esi to the string we */
X/* are hoping to match it up with. In actuality, %esi and %edi are */
X/* both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and %edx is */
X/* initialized to -MAX_MATCH_8, so comparing starts scanalign bytes in. */
X
X movl window(%esp), %esi
X movl scan(%esp), %edi
X addl %ecx, %esi
X movl scanalign(%esp), %eax
X movl $(-MAX_MATCH_8), %edx
X lea MAX_MATCH_8(%edi,%eax), %edi
X lea MAX_MATCH_8(%esi,%eax), %esi
X
X/* Test the strings for equality, 8 bytes at a time. At the end,
X * adjust %edx so that it is offset to the exact byte that mismatched.
X *
X * We already know at this point that the first three bytes of the
X * strings match each other, and they can be safely passed over before
X * starting the compare loop. So what this code does is skip over 0-3
X * bytes, as much as necessary in order to dword-align the %edi
X * pointer. (%esi will still be misaligned three times out of four.)
X *
X * It should be confessed that this loop usually does not represent
X * much of the total running time. Replacing it with a more
X * straightforward "rep cmpsb" would not drastically degrade
X * performance.
X */
XLoopCmps:
X movl (%esi,%edx), %eax
X xorl (%edi,%edx), %eax /* nonzero bits mark differing bytes */
X jnz LeaveLoopCmps
X movl 4(%esi,%edx), %eax
X xorl 4(%edi,%edx), %eax
X jnz LeaveLoopCmps4
X addl $8, %edx
X jnz LoopCmps
X jmp LenMaximum
XLeaveLoopCmps4: addl $4, %edx
XLeaveLoopCmps: testl $0x0000FFFF, %eax /* difference within the two low bytes? */
X jnz LenLower
X addl $2, %edx /* no: skip them and examine the upper half */
X shrl $16, %eax
XLenLower: subb $1, %al /* carry is set only if %al was zero (byte equal) */
X adcl $0, %edx /* ... in which case the next byte is the mismatch */
X
X/* Calculate the length of the match. If it is at least MAX_MATCH, */
X/* then automatically accept it as the best possible match and leave. */
X
X lea (%edi,%edx), %eax
X movl scan(%esp), %edi
X subl %edi, %eax
X cmpl $MAX_MATCH, %eax
X jge LenMaximum
X
X/* If the length of the match is not longer than the best match we */
X/* have so far, then forget it and return to the lookup loop. */
X
X movl deflatestate(%esp), %edx
X movl bestlen(%esp), %ebx
X cmpl %ebx, %eax
X jg LongerMatch
X movl windowbestlen(%esp), %esi /* reload the loop registers */
X movl dsPrev(%edx), %edi
X movl scanend(%esp), %ebx
X movl chainlenwmask(%esp), %edx
X jmp LookupLoop
X
X/* s->match_start = cur_match; */
X/* best_len = len; */
X/* if (len >= nice_match) break; */
X/* scan_end = *(ushf*)(scan+best_len-1); */
X
XLongerMatch: movl nicematch(%esp), %ebx
X movl %eax, bestlen(%esp)
X movl %ecx, dsMatchStart(%edx)
X cmpl %ebx, %eax
X jge LeaveNow
X movl window(%esp), %esi
X addl %eax, %esi
X movl %esi, windowbestlen(%esp)
X movzwl -1(%edi,%eax), %ebx /* %edi still holds scan at this point */
X movl dsPrev(%edx), %edi
X movl %ebx, scanend(%esp)
X movl chainlenwmask(%esp), %edx
X jmp LookupLoop
X
X/* Accept the current string, with the maximum possible length. */
X
XLenMaximum: movl deflatestate(%esp), %edx
X movl $MAX_MATCH, bestlen(%esp)
X movl %ecx, dsMatchStart(%edx)
X
X/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
X/* return s->lookahead; */
X
XLeaveNow:
X movl deflatestate(%esp), %edx
X movl bestlen(%esp), %ebx
X movl dsLookahead(%edx), %eax
X cmpl %eax, %ebx
X jg LookaheadRet /* best_len > lookahead: return lookahead */
X movl %ebx, %eax
XLookaheadRet:
X
X/* Restore the stack and return from whence we came. */
X
X addl $LocalVarsSize, %esp
X popl %ebx
X popl %esi
X popl %edi
X popl %ebp
Xmatch_init: ret /* match_init is just this ret, shared with longest_match */
END-of-contrib/asm686/match.S
echo x - contrib/inflate86/_inffast.S
sed 's/^X//' >contrib/inflate86/_inffast.S << 'END-of-contrib/inflate86/_inffast.S'
X/*
X * inffast.S is a hand tuned assembler version of:
X *
X * inffast.c -- fast decoding
X * Copyright (C) 1995-2003 Mark Adler
X * For conditions of distribution and use, see copyright notice in zlib.h
X *
X * Copyright (C) 2003 Chris Anderson <christop at charm.net>
X * Please use the copyright conditions above.
X *
X * This version (Jan-23-2003) of inflate_fast was coded and tested under
X * GNU/Linux on a pentium 3, using the gcc-3.2 compiler distribution. On that
X * machine, I found that gzip style archives decompressed about 20% faster
X * than the gcc-3.2 -O3 -fomit-frame-pointer compiled version. Your results
X * will depend on how large of a buffer is used for z_stream.next_in &
X * next_out (8K-32K worked best for my 256K cpu cache) and how much overhead
X * there is in stream processing I/O and crc32/adler32. In my case, this
X * routine used 70% of the cpu time and crc32 used 20%.
X *
X * I am confident that this version will work in the general case, but I have
X * not tested a wide variety of datasets or a wide variety of platforms.
X *
X * Jan-24-2003 -- Added -DUSE_MMX define for slightly faster inflating.
X * It should be a runtime flag instead of compile time flag...
X *
X * Jan-26-2003 -- Added runtime check for MMX support with cpuid instruction.
X * With -DUSE_MMX, only MMX code is compiled. With -DNO_MMX, only non-MMX
X * code is compiled. Without either option, runtime detection is enabled.
X * Runtime detection should work on all modern cpus and the recommended
X * algorithm (flip ID bit on eflags and then use the cpuid instruction) is
X * used in many multimedia applications. Tested under win2k with gcc-2.95 and
X * gas-2.12 distributed with cygwin3. Compiling with gcc-2.95 -c inffast.S -o
X * inffast.obj generates a COFF object which can then be linked with MSVC++
X * compiled code. Tested under FreeBSD 4.7 with gcc-2.95.
X *
X * Jan-28-2003 -- Tested Athlon XP... MMX mode is slower than no MMX (and
X * slower than compiler generated code). Adjusted cpuid check to use the MMX
X * code only for Pentiums < P4 until I have more data on the P4. Speed
X * improvement is only about 15% on the Athlon when compared with code
X * generated with MSVC++. Not sure yet, but I think the P4 will also be slower
X * using the MMX mode because many of its x86 ALU instructions execute in .5
X * cycles and have less latency than MMX ops. Added code to buffer the last 11
X * bytes of the input stream since the MMX code grabs bits in chunks of 32,
X * which differs from the inffast.c algorithm. I don't think there would have
X * been read overruns where a page boundary was crossed (a segfault), but
X * there could have been overruns when next_in ends on unaligned memory
X * (uninitialized memory read).
X *
X * Mar-13-2003 -- P4 MMX is slightly slower than P4 NO_MMX. I created a C
X * version of the non-MMX code so that it doesn't depend on zstrm and zstate
X * structure offsets which are hard coded in this file. This was last tested
X * with zlib-1.2.0 which is currently in beta testing, newer versions of this
X * and inffas86.c can be found at http://www.eetbeetee.com/zlib/ and
X * http://www.charm.net/~christop/zlib/
X */
X
X
X/*
X * if you have underscore linking problems (_inflate_fast undefined), try
X * using -DGAS_COFF
X */
X#if ! defined( GAS_COFF ) && ! defined( GAS_ELF )
X
X#if defined( WIN32 ) || defined( __CYGWIN__ )
X#define GAS_COFF /* windows object format */
X#else
X#define GAS_ELF
X#endif
X
X#endif /* ! GAS_COFF && ! GAS_ELF */
X
X
X#if defined( GAS_COFF )
X
X/* coff externals have underscores */
X#define inflate_fast _inflate_fast
X#define inflate_fast_use_mmx _inflate_fast_use_mmx
X
X#endif /* GAS_COFF */
X
X
X.file "inffast.S"
X
X.globl inflate_fast
X
X.text
X.align 4,0
X.L_invalid_literal_length_code_msg:
X.string "invalid literal/length code"
X
X.align 4,0
X.L_invalid_distance_code_msg:
X.string "invalid distance code"
X
X.align 4,0
X.L_invalid_distance_too_far_msg:
X.string "invalid distance too far back"
X
X#if ! defined( NO_MMX )
X.align 4,0
X.L_mask: /* mask[N] = ( 1 << N ) - 1 */
X.long 0
X.long 1
X.long 3
X.long 7
X.long 15
X.long 31
X.long 63
X.long 127
X.long 255
X.long 511
X.long 1023
X.long 2047
X.long 4095
X.long 8191
X.long 16383
X.long 32767
X.long 65535
X.long 131071
X.long 262143
X.long 524287
X.long 1048575
X.long 2097151
X.long 4194303
X.long 8388607
X.long 16777215
X.long 33554431
X.long 67108863
X.long 134217727
X.long 268435455
X.long 536870911
X.long 1073741823
X.long 2147483647
X.long 4294967295
X#endif /* NO_MMX */
X
X.text
X
X/*
X * struct z_stream offsets, in zlib.h
X */
X#define next_in_strm 0 /* strm->next_in */
X#define avail_in_strm 4 /* strm->avail_in */
X#define next_out_strm 12 /* strm->next_out */
X#define avail_out_strm 16 /* strm->avail_out */
X#define msg_strm 24 /* strm->msg */
X#define state_strm 28 /* strm->state */
X
X/*
X * struct inflate_state offsets, in inflate.h
X */
X#define mode_state 0 /* state->mode */
X#define wsize_state 32 /* state->wsize */
X#define write_state 40 /* state->write */
X#define window_state 44 /* state->window */
X#define hold_state 48 /* state->hold */
X#define bits_state 52 /* state->bits */
X#define lencode_state 68 /* state->lencode */
X#define distcode_state 72 /* state->distcode */
X#define lenbits_state 76 /* state->lenbits */
X#define distbits_state 80 /* state->distbits */
X
X/*
X * inflate_fast's activation record
X */
X#define local_var_size 64 /* how much local space for vars */
X#define strm_sp 88 /* first arg: z_stream * (local_var_size + 24) */
X#define start_sp 92 /* second arg: unsigned int (local_var_size + 28)
*/
X
X/*
X * offsets for local vars on stack
X */
X#define out 60 /* unsigned char* */
X#define window 56 /* unsigned char* */
X#define wsize 52 /* unsigned int */
X#define write 48 /* unsigned int */
X#define in 44 /* unsigned char* */
X#define beg 40 /* unsigned char* */
X#define buf 28 /* char[ 12 ] */
X#define len 24 /* unsigned int */
X#define last 20 /* unsigned char* */
X#define end 16 /* unsigned char* */
X#define dcode 12 /* code* */
X#define lcode 8 /* code* */
X#define dmask 4 /* unsigned int */
X#define lmask 0 /* unsigned int */
X
X/*
X * typedef enum inflate_mode consts, in inflate.h
X */
X#define INFLATE_MODE_TYPE 11 /* state->mode flags enum-ed in inflate.h */
X#define INFLATE_MODE_BAD 26
X
X
X#if ! defined( USE_MMX ) && ! defined( NO_MMX )
X
X#define RUN_TIME_MMX
X
X#define CHECK_MMX 1
X#define DO_USE_MMX 2
X#define DONT_USE_MMX 3
X
X.globl inflate_fast_use_mmx
X
X.data
X
X.align 4,0
Xinflate_fast_use_mmx: /* integer flag for run time control 1=check,2=mmx,3=no */
X.long CHECK_MMX
X
X#if defined( GAS_ELF )
X/* elf info: mark the flag as a 4-byte data object */
X.type inflate_fast_use_mmx,@object
X.size inflate_fast_use_mmx,4
X#endif
X
X#endif /* RUN_TIME_MMX */
X
X#if defined( GAS_COFF )
X/* coff info: scl 2 = extern, type 32 = function */
X.def inflate_fast; .scl 2; .type 32; .endef
X#endif
X
X.text
X
X.align 32,0x90
Xinflate_fast:
X pushl %edi
X pushl %esi
X pushl %ebp
X pushl %ebx
X pushf /* save eflags (strm_sp, state_sp assumes this is 32 bits) */
X subl $local_var_size, %esp
X cld
X
X#define strm_r %esi
X#define state_r %edi
X
X movl strm_sp(%esp), strm_r
X movl state_strm(strm_r), state_r
X
X /* in = strm->next_in;
X * out = strm->next_out;
X * last = in + strm->avail_in - 11;
X * beg = out - (start - strm->avail_out);
X * end = out + (strm->avail_out - 257);
X */
X movl avail_in_strm(strm_r), %edx
X movl next_in_strm(strm_r), %eax
X
X addl %eax, %edx /* avail_in += next_in */
X subl $11, %edx /* avail_in -= 11 */
X
X movl %eax, in(%esp)
X movl %edx, last(%esp)
X
X movl start_sp(%esp), %ebp
X movl avail_out_strm(strm_r), %ecx
X movl next_out_strm(strm_r), %ebx
X
X subl %ecx, %ebp /* start -= avail_out */
X negl %ebp /* start = -start */
X addl %ebx, %ebp /* start += next_out */
X
X subl $257, %ecx /* avail_out -= 257 */
X addl %ebx, %ecx /* avail_out += out */
X
X movl %ebx, out(%esp)
X movl %ebp, beg(%esp)
X movl %ecx, end(%esp)
X
X /* wsize = state->wsize;
X * write = state->write;
X * window = state->window;
X * hold = state->hold;
X * bits = state->bits;
X * lcode = state->lencode;
X * dcode = state->distcode;
X * lmask = ( 1 << state->lenbits ) - 1;
X * dmask = ( 1 << state->distbits ) - 1;
X */
X
X movl lencode_state(state_r), %eax
X movl distcode_state(state_r), %ecx
X
X movl %eax, lcode(%esp)
X movl %ecx, dcode(%esp)
X
X movl $1, %eax
X movl lenbits_state(state_r), %ecx
X shll %cl, %eax
X decl %eax
X movl %eax, lmask(%esp)
X
X movl $1, %eax
X movl distbits_state(state_r), %ecx
X shll %cl, %eax
X decl %eax
X movl %eax, dmask(%esp)
X
X movl wsize_state(state_r), %eax
X movl write_state(state_r), %ecx
X movl window_state(state_r), %edx
X
X movl %eax, wsize(%esp)
X movl %ecx, write(%esp)
X movl %edx, window(%esp)
X
X movl hold_state(state_r), %ebp
X movl bits_state(state_r), %ebx
X
X#undef strm_r
X#undef state_r
X
X#define in_r %esi
X#define from_r %esi
X#define out_r %edi
X
X movl in(%esp), in_r
X movl last(%esp), %ecx
X cmpl in_r, %ecx
X ja .L_align_long /* if in < last */
X
X addl $11, %ecx /* ecx = &in[ avail_in ] */
X subl in_r, %ecx /* ecx = avail_in */
X movl $12, %eax
X subl %ecx, %eax /* eax = 12 - avail_in */
X leal buf(%esp), %edi
X rep movsb /* memcpy( buf, in, avail_in ) */
X movl %eax, %ecx
X xorl %eax, %eax
X rep stosb /* memset( &buf[ avail_in ], 0, 12 - avail_in )
*/
X leal buf(%esp), in_r /* in = buf */
X movl in_r, last(%esp) /* last = in, do just one iteration
*/
X jmp .L_is_aligned
X
X /* align in_r on long boundary */
X.L_align_long:
X testl $3, in_r
X jz .L_is_aligned
X xorl %eax, %eax
X movb (in_r), %al
X incl in_r
X movl %ebx, %ecx
X addl $8, %ebx
X shll %cl, %eax
X orl %eax, %ebp
X jmp .L_align_long
X
X.L_is_aligned:
X movl out(%esp), out_r
X
X#if defined( NO_MMX )
X jmp .L_do_loop
X#endif
X
X#if defined( USE_MMX )
X jmp .L_init_mmx
X#endif
X
X/*** Runtime MMX check ***/
X
X#if defined( RUN_TIME_MMX )
X/* Decide at run time whether the MMX loop may be used. The verdict is
X * cached in inflate_fast_use_mmx: a value equal to DO_USE_MMX selects the
X * MMX loop, a larger (unsigned) value selects the plain loop, and any
X * smaller value runs the probe below, which stores DO_USE_MMX or
X * DONT_USE_MMX and then re-tests from the top.
X * NOTE(review): the re-test only terminates if DONT_USE_MMX compares
X * above DO_USE_MMX; both are defined outside this section -- confirm.
X *
X * MMX is enabled only when all of the following hold: the EFLAGS ID bit
X * can be toggled, the cpuid vendor string is GenuineIntel, the family
X * field is exactly 6, and feature bit 23 is set. Every other CPU falls
X * back to the plain loop.
X */
X.L_check_mmx:
X cmpl $DO_USE_MMX, inflate_fast_use_mmx
X je .L_init_mmx
X ja .L_do_loop /* above DO_USE_MMX: already probed, no MMX */
X
X /* the probe uses cpuid, which overwrites eax/ebx/ecx/edx */
X pushl %eax
X pushl %ebx
X pushl %ecx
X pushl %edx
X pushf
X movl (%esp), %eax /* copy eflags to eax */
X xorl $0x200000, (%esp) /* try toggling ID bit of eflags (bit 21)
X * to see if cpu supports cpuid...
X * ID bit method not supported by NexGen,
X * but bios may load a cpuid instruction and
X * cpuid may be disabled on Cyrix 5-6x86 */
X popf
X pushf
X popl %edx /* copy new eflags to edx */
X xorl %eax, %edx /* test if ID bit is flipped */
X jz .L_dont_use_mmx /* not flipped if zero */
X xorl %eax, %eax
X cpuid
X cmpl $0x756e6547, %ebx /* "Genu": check for GenuineIntel in ebx,edx,ecx */
X jne .L_dont_use_mmx
X cmpl $0x6c65746e, %ecx /* "ntel" */
X jne .L_dont_use_mmx
X cmpl $0x49656e69, %edx /* "ineI" */
X jne .L_dont_use_mmx
X movl $1, %eax
X cpuid /* get cpu features */
X shrl $8, %eax
X andl $15, %eax /* eax = bits 8..11 of the cpuid(1) signature */
X cmpl $6, %eax /* family must be 6; a P4 reports 0xf and is rejected */
X jne .L_dont_use_mmx
X testl $0x800000, %edx /* test if MMX feature is set (bit 23) */
X jnz .L_use_mmx
X jmp .L_dont_use_mmx
X.L_use_mmx:
X movl $DO_USE_MMX, inflate_fast_use_mmx
X jmp .L_check_mmx_pop
X.L_dont_use_mmx:
X movl $DONT_USE_MMX, inflate_fast_use_mmx
X.L_check_mmx_pop:
X popl %edx
X popl %ecx
X popl %ebx
X popl %eax
X jmp .L_check_mmx /* re-dispatch on the freshly stored verdict */
X#endif
X
X
X/*** Non-MMX code ***/
X
X#if defined ( NO_MMX ) || defined( RUN_TIME_MMX )
X
X#define hold_r %ebp
X#define bits_r %bl
X#define bitslong_r %ebx
X
X.align 32,0x90
X.L_while_test:
X /* while (in < last && out < end)
X */
X cmpl out_r, end(%esp)
X jbe .L_break_loop /* if (out >= end) */
X
X cmpl in_r, last(%esp)
X jbe .L_break_loop
X
X.L_do_loop:
X /* Top of the decode loop: make sure hold has more than 15 valid bits
X * before the length-code lookup that follows.
X *
X * regs: %esi = in, %ebp = hold, %bl = bits, %edi = out
X *
X * do {
X * if (bits <= 15) {
X * hold |= *((unsigned short *)in)++ << bits;
X * bits += 16
X * }
X * this = lcode[hold & lmask]
X */
X cmpb $15, bits_r
X ja .L_get_length_code /* if (15 < bits) */
X
X xorl %eax, %eax /* clear the upper half; lodsw only writes %ax */
X lodsw /* ax = *(ushort *)in++ */
X movb bits_r, %cl /* cl = bits, needed as the shift count */
X addb $16, bits_r /* bits += 16 */
X shll %cl, %eax
X orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */
X
X.L_get_length_code:
X movl lmask(%esp), %edx /* edx = lmask */
X movl lcode(%esp), %ecx /* ecx = lcode */
X andl hold_r, %edx /* edx &= hold */
X movl (%ecx,%edx,4), %eax /* eax = lcode[hold & lmask] */
X
X.L_dolen:
X /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out
X *
X * dolen:
X * bits -= this.bits;
X * hold >>= this.bits
X */
X movb %ah, %cl /* cl = this.bits */
X subb %ah, bits_r /* bits -= this.bits */
X shrl %cl, hold_r /* hold >>= this.bits */
X
X /* check if op is a literal
X * if (op == 0) {
X * PUP(out) = this.val;
X * }
X */
X testb %al, %al
X jnz .L_test_for_length_base /* if (op != 0) 45.7% */
X
X shrl $16, %eax /* output this.val char */
X stosb
X jmp .L_while_test
X
X.L_test_for_length_base:
X /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = len
X *
X * else if (op & 16) {
X * len = this.val
X * op &= 15
X * if (op) {
X * if (op > bits) {
X * hold |= *((unsigned short *)in)++ << bits;
X * bits += 16
X * }
X * len += hold & mask[op];
X * bits -= op;
X * hold >>= op;
X * }
X */
X#define len_r %edx
X movl %eax, len_r /* len = this */
X shrl $16, len_r /* len = this.val */
X movb %al, %cl
X
X testb $16, %al
X jz .L_test_for_second_level_length /* if ((op & 16) == 0) 8% */
X andb $15, %cl /* op &= 15 */
X jz .L_save_len /* if (!op) */
X cmpb %cl, bits_r
X jae .L_add_bits_to_len /* if (op <= bits) */
X
X movb %cl, %ch /* stash op in ch, freeing cl */
X xorl %eax, %eax
X lodsw /* al = *(ushort *)in++ */
X movb bits_r, %cl /* cl = bits, needs it for shifting
*/
X addb $16, bits_r /* bits += 16 */
X shll %cl, %eax
X orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits
*/
X movb %ch, %cl /* move op back to ecx */
X
X.L_add_bits_to_len:
X movl $1, %eax
X shll %cl, %eax
X decl %eax
X subb %cl, bits_r
X andl hold_r, %eax /* eax &= hold */
X shrl %cl, hold_r
X addl %eax, len_r /* len += hold & mask[op] */
X
X.L_save_len:
X movl len_r, len(%esp) /* save len */
X#undef len_r
X
X.L_decode_distance:
X /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = dist
X *
X * if (bits < 15) {
X * hold |= *((unsigned short *)in)++ << bits;
X * bits += 16
X * }
X * this = dcode[hold & dmask];
X * dodist:
X * bits -= this.bits;
X * hold >>= this.bits;
X * op = this.op;
X */
X
X cmpb $15, bits_r
X ja .L_get_distance_code /* if (15 < bits) */
X
X xorl %eax, %eax
X lodsw /* al = *(ushort *)in++ */
X movb bits_r, %cl /* cl = bits, needs it for shifting
*/
X addb $16, bits_r /* bits += 16 */
X shll %cl, %eax
X orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits
*/
X
X.L_get_distance_code:
X movl dmask(%esp), %edx /* edx = dmask */
X movl dcode(%esp), %ecx /* ecx = dcode */
X andl hold_r, %edx /* edx &= hold */
X movl (%ecx,%edx,4), %eax /* eax = dcode[hold & dmask] */
X
X#define dist_r %edx
X.L_dodist:
X movl %eax, dist_r /* dist = this */
X shrl $16, dist_r /* dist = this.val */
X movb %ah, %cl
X subb %ah, bits_r /* bits -= this.bits */
X shrl %cl, hold_r /* hold >>= this.bits */
X
X /* if (op & 16) {
X * dist = this.val
X * op &= 15
X * if (op > bits) {
X * hold |= *((unsigned short *)in)++ << bits;
X * bits += 16
X * }
X * dist += hold & mask[op];
X * bits -= op;
X * hold >>= op;
X */
X movb %al, %cl /* cl = this.op */
X
X testb $16, %al /* if ((op & 16) == 0) */
X jz .L_test_for_second_level_dist
X andb $15, %cl /* op &= 15 */
X jz .L_check_dist_one
X cmpb %cl, bits_r
X jae .L_add_bits_to_dist /* if (op <= bits) 97.6% */
X
X movb %cl, %ch /* stash op in ch, freeing cl */
X xorl %eax, %eax
X lodsw /* al = *(ushort *)in++ */
X movb bits_r, %cl /* cl = bits, needs it for shifting
*/
X addb $16, bits_r /* bits += 16 */
X shll %cl, %eax
X orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits
*/
X movb %ch, %cl /* move op back to ecx */
X
X.L_add_bits_to_dist:
X movl $1, %eax
X shll %cl, %eax
X decl %eax /* (1 << op) - 1 */
X subb %cl, bits_r
X andl hold_r, %eax /* eax &= hold */
X shrl %cl, hold_r
X addl %eax, dist_r /* dist += hold & ((1 << op) - 1) */
X jmp .L_check_window
X
X.L_check_window:
X /* Copy a match of len bytes that starts dist bytes back. If the whole
X * source lies inside the output written so far it is copied here;
X * otherwise control moves to .L_clip_window.
X *
X * regs: %esi = in (saved below, then reused as from), %ebp = hold,
X * %bl = bits, %edi = out, %edx = dist, %eax = nbytes
X *
X * nbytes = out - beg;
X * if (dist <= nbytes) {
X * from = out - dist;
X * do {
X * PUP(out) = PUP(from);
X * } while (--len > 0);
X * }
X */
X
X movl in_r, in(%esp) /* save in so from can use its register */
X movl out_r, %eax
X subl beg(%esp), %eax /* nbytes = out - beg */
X
X cmpl dist_r, %eax
X jb .L_clip_window /* if (dist > nbytes) 4.2% */
X
X movl len(%esp), %ecx
X movl out_r, from_r
X subl dist_r, from_r /* from = out - dist */
X
X /* The first three bytes are copied by hand and rep movsb handles the
X * remaining len - 3, so this presumably relies on len >= 3 (TODO
X * confirm against the length tables). Bytes 1 and 2 are both loaded
X * before either is stored, which is only right for dist >= 2;
X * presumably a dist of 1 always arrives via .L_check_dist_one, which
X * handles it separately -- confirm. %dl is used as scratch, so dist
X * (%edx) is dead from here on.
X */
X subl $3, %ecx
X movb (from_r), %al
X movb %al, (out_r)
X movb 1(from_r), %al
X movb 2(from_r), %dl
X addl $3, from_r
X movb %al, 1(out_r)
X movb %dl, 2(out_r)
X addl $3, out_r
X rep movsb
X
X movl in(%esp), in_r /* move in back to %esi, toss from */
X jmp .L_while_test
X
X.align 16,0x90
X.L_check_dist_one:
X /* Entered when the distance code carries no extra bits. If dist == 1
X * and at least one byte has already been written (out != beg), the
X * match is a run of the previous output byte, so it is filled with
X * stores instead of a byte-by-byte copy. Anything else goes through
X * the general path at .L_check_window.
X *
X * regs: %edx = dist, %edi = out, len is in len(%esp)
X */
X cmpl $1, dist_r
X jne .L_check_window
X cmpl out_r, beg(%esp)
X je .L_check_window
X
X decl out_r /* point at the byte to be repeated */
X movl len(%esp), %ecx
X movb (out_r), %al /* al = out[-1] */
X subl $3, %ecx /* three bytes are stored by hand below */
X
X movb %al, 1(out_r)
X movb %al, 2(out_r)
X movb %al, 3(out_r)
X addl $4, out_r /* undo the decl and skip the three stored bytes */
X rep stosb /* store the remaining len - 3 copies */
X
X jmp .L_while_test
X
X.align 16,0x90
X.L_test_for_second_level_length:
X /* else if ((op & 64) == 0) {
X * this = lcode[this.val + (hold & mask[op])];
X * }
X */
X testb $64, %al
X jnz .L_test_for_end_of_block /* if ((op & 64) != 0) */
X
X movl $1, %eax
X shll %cl, %eax
X decl %eax
X andl hold_r, %eax /* eax &= hold */
X addl %edx, %eax /* eax += this.val */
X movl lcode(%esp), %edx /* edx = lcode */
X movl (%edx,%eax,4), %eax /* eax = lcode[val + (hold&mask[op])]
*/
X jmp .L_dolen
X
X.align 16,0x90
X.L_test_for_second_level_dist:
X /* else if ((op & 64) == 0) {
X * this = dcode[this.val + (hold & mask[op])];
X * }
X */
X testb $64, %al
X jnz .L_invalid_distance_code /* if ((op & 64) != 0) */
X
X movl $1, %eax
X shll %cl, %eax
X decl %eax
X andl hold_r, %eax /* eax &= hold */
X addl %edx, %eax /* eax += this.val */
X movl dcode(%esp), %edx /* edx = dcode */
X movl (%edx,%eax,4), %eax /* eax = dcode[val + (hold&mask[op])]
*/
X jmp .L_dodist
X
X.align 16,0x90
X.L_clip_window:
X /* Reached when dist > out - beg: the start of the match lies in the
X * sliding window rather than in the output produced so far.
X *
X * on entry: %eax = out - beg, %edx = dist, %edi = out, %ebp = hold,
X * %bl = bits; in is already saved in in(%esp), so %esi
X * is free to serve as from
X *
X * else {
X * if (dist > wsize) {
X * invalid distance
X * }
X * from = window;
X * nbytes = dist - nbytes;
X * if (write == 0) {
X * from += wsize - nbytes;
X */
X#define nbytes_r %ecx
X movl %eax, nbytes_r
X movl wsize(%esp), %eax /* prepare for dist compare */
X negl nbytes_r /* nbytes = -nbytes */
X movl window(%esp), from_r /* from = window */
X
X cmpl dist_r, %eax
X jb .L_invalid_distance_too_far /* if (dist > wsize) */
X
X addl dist_r, nbytes_r /* nbytes = dist - nbytes */
X cmpl $0, write(%esp)
X jne .L_wrap_around_window /* if (write != 0) */
X
X subl nbytes_r, %eax
X addl %eax, from_r /* from += wsize - nbytes */
X
X /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out,
X * %edx = dist, %ecx = nbytes, %eax = len
X *
X * if (nbytes < len) {
X * len -= nbytes;
X * do {
X * PUP(out) = PUP(from);
X * } while (--nbytes);
X * from = out - dist;
X * }
X * }
X */
X#define len_r %eax
X movl len(%esp), len_r
X cmpl nbytes_r, len_r
X jbe .L_do_copy1 /* if (nbytes >= len) */
X
X subl nbytes_r, len_r /* len -= nbytes */
X rep movsb /* copy the nbytes that come from the window */
X movl out_r, from_r
X subl dist_r, from_r /* from = out - dist */
X jmp .L_do_copy1
X
X.L_wrap_around_window:
X /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx =
dist
X * %ecx = nbytes, %eax = write, %eax = len
X *
X * else if (write < nbytes) {
X * from += wsize + write - nbytes;
X * nbytes -= write;
X * if (nbytes < len) {
X * len -= nbytes;
X * do {
X * PUP(out) = PUP(from);
X * } while (--nbytes);
X * from = window;
X * nbytes = write;
X * if (nbytes < len) {
X * len -= nbytes;
X * do {
X * PUP(out) = PUP(from);
X * } while(--nbytes);
X * from = out - dist;
X * }
X * }
X * }
X */
X#define write_r %eax
X movl write(%esp), write_r
X cmpl write_r, nbytes_r
X jbe .L_contiguous_in_window /* if (write >= nbytes) */
X
X addl wsize(%esp), from_r
X addl write_r, from_r
X subl nbytes_r, from_r /* from += wsize + write - nbytes */
X subl write_r, nbytes_r /* nbytes -= write */
X#undef write_r
X
X movl len(%esp), len_r
X cmpl nbytes_r, len_r
X jbe .L_do_copy1 /* if (nbytes >= len) */
X
X subl nbytes_r, len_r /* len -= nbytes */
X rep movsb
X movl window(%esp), from_r /* from = window */
X movl write(%esp), nbytes_r /* nbytes = write */
X cmpl nbytes_r, len_r
X jbe .L_do_copy1 /* if (nbytes >= len) */
X
X subl nbytes_r, len_r /* len -= nbytes */
X rep movsb
X movl out_r, from_r
X subl dist_r, from_r /* from = out - dist */
X jmp .L_do_copy1
X
X.L_contiguous_in_window:
X /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx =
dist
X * %ecx = nbytes, %eax = write, %eax = len
X *
X * else {
X * from += write - nbytes;
X * if (nbytes < len) {
X * len -= nbytes;
X * do {
X * PUP(out) = PUP(from);
X * } while (--nbytes);
X * from = out - dist;
X * }
X * }
X */
X#define write_r %eax
X addl write_r, from_r
X subl nbytes_r, from_r /* from += write - nbytes */
X#undef write_r
X
X movl len(%esp), len_r
X cmpl nbytes_r, len_r
X jbe .L_do_copy1 /* if (nbytes >= len) */
X
X subl nbytes_r, len_r /* len -= nbytes */
X rep movsb
X movl out_r, from_r
X subl dist_r, from_r /* from = out - dist */
X
X.L_do_copy1:
X /* regs: %esi = from, %esi = in, %ebp = hold, %bl = bits, %edi = out
X * %eax = len
X *
X * while (len > 0) {
X * PUP(out) = PUP(from);
X * len--;
X * }
X * }
X * } while (in < last && out < end);
X */
X#undef nbytes_r
X#define in_r %esi
X movl len_r, %ecx
X rep movsb
X
X movl in(%esp), in_r /* move in back to %esi, toss from */
X jmp .L_while_test
X
X#undef len_r
X#undef dist_r
X
X#endif /* NO_MMX || RUN_TIME_MMX */
X
X
X/*** MMX code ***/
X
X#if defined( USE_MMX ) || defined( RUN_TIME_MMX )
X
X.align 32,0x90
X.L_init_mmx:
X emms
X
X#undef bits_r
X#undef bitslong_r
X#define bitslong_r %ebp
X#define hold_mm %mm0
X movd %ebp, hold_mm
X movl %ebx, bitslong_r
X
X#define used_mm %mm1
X#define dmask2_mm %mm2
X#define lmask2_mm %mm3
X#define lmask_mm %mm4
X#define dmask_mm %mm5
X#define tmp_mm %mm6
X
X movd lmask(%esp), lmask_mm
X movq lmask_mm, lmask2_mm
X movd dmask(%esp), dmask_mm
X movq dmask_mm, dmask2_mm
X pxor used_mm, used_mm
X movl lcode(%esp), %ebx /* ebx = lcode */
X jmp .L_do_loop_mmx
X
X.align 32,0x90
X.L_while_test_mmx:
X /* while (in < last && out < end)
X */
X cmpl out_r, end(%esp)
X jbe .L_break_loop /* if (out >= end) */
X
X cmpl in_r, last(%esp)
X jbe .L_break_loop
X
X.L_do_loop_mmx:
X psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
X
X cmpl $32, bitslong_r
X ja .L_get_length_code_mmx /* if (32 < bits) */
X
X movd bitslong_r, tmp_mm
X movd (in_r), %mm7
X addl $4, in_r
X psllq tmp_mm, %mm7
X addl $32, bitslong_r
X por %mm7, hold_mm /* hold_mm |= *((uint *)in)++ << bits
*/
X
X.L_get_length_code_mmx:
X pand hold_mm, lmask_mm
X movd lmask_mm, %eax
X movq lmask2_mm, lmask_mm
X movl (%ebx,%eax,4), %eax /* eax = lcode[hold & lmask] */
X
X.L_dolen_mmx:
X movzbl %ah, %ecx /* ecx = this.bits */
X movd %ecx, used_mm
X subl %ecx, bitslong_r /* bits -= this.bits */
X
X testb %al, %al
X jnz .L_test_for_length_base_mmx /* if (op != 0) 45.7% */
X
X shrl $16, %eax /* output this.val char */
X stosb
X jmp .L_while_test_mmx
X
X.L_test_for_length_base_mmx:
X#define len_r %edx
X movl %eax, len_r /* len = this */
X shrl $16, len_r /* len = this.val */
X
X testb $16, %al
X jz .L_test_for_second_level_length_mmx /* if ((op & 16) == 0) 8%
*/
X andl $15, %eax /* op &= 15 */
X jz .L_decode_distance_mmx /* if (!op) */
X
X psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
X movd %eax, used_mm
X movd hold_mm, %ecx
X subl %eax, bitslong_r
X andl .L_mask(,%eax,4), %ecx
X addl %ecx, len_r /* len += hold & mask[op] */
X
X.L_decode_distance_mmx:
X psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
X
X cmpl $32, bitslong_r
X ja .L_get_dist_code_mmx /* if (32 < bits) */
X
X movd bitslong_r, tmp_mm
X movd (in_r), %mm7
X addl $4, in_r
X psllq tmp_mm, %mm7
X addl $32, bitslong_r
X por %mm7, hold_mm /* hold_mm |= *((uint *)in)++ << bits
*/
X
X.L_get_dist_code_mmx:
X /* First-level distance lookup. %ebx stops holding lcode here; the
X * paths that return to the loop top reload it from lcode(%esp).
X */
X movl dcode(%esp), %ebx /* ebx = dcode */
X pand hold_mm, dmask_mm
X movd dmask_mm, %eax /* eax = hold & dmask */
X movq dmask2_mm, dmask_mm /* restore dmask from its spare copy */
X movl (%ebx,%eax,4), %eax /* eax = dcode[hold & dmask] */
X
X.L_dodist_mmx:
X#define dist_r %ebx
X movzbl %ah, %ecx /* ecx = this.bits */
X movl %eax, dist_r
X shrl $16, dist_r /* dist = this.val */
X subl %ecx, bitslong_r /* bits -= this.bits */
X movd %ecx, used_mm
X
X testb $16, %al /* if ((op & 16) == 0) */
X jz .L_test_for_second_level_dist_mmx
X andl $15, %eax /* op &= 15 */
X jz .L_check_dist_one_mmx
X
X.L_add_bits_to_dist_mmx:
X psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
X movd %eax, used_mm /* save bit length of current op */
X movd hold_mm, %ecx /* get the next bits on input stream
*/
X subl %eax, bitslong_r /* bits -= op bits */
X andl .L_mask(,%eax,4), %ecx /* ecx = hold & mask[op] */
X addl %ecx, dist_r /* dist += hold & mask[op] */
X
X.L_check_window_mmx:
X movl in_r, in(%esp) /* save in so from can use it's reg
*/
X movl out_r, %eax
X subl beg(%esp), %eax /* nbytes = out - beg */
X
X cmpl dist_r, %eax
X jb .L_clip_window_mmx /* if (dist > nbytes) 4.2% */
X
X movl len_r, %ecx
X movl out_r, from_r
X subl dist_r, from_r /* from = out - dist */
X
X subl $3, %ecx
X movb (from_r), %al
X movb %al, (out_r)
X movb 1(from_r), %al
X movb 2(from_r), %dl
X addl $3, from_r
X movb %al, 1(out_r)
X movb %dl, 2(out_r)
X addl $3, out_r
X rep movsb
X
X movl in(%esp), in_r /* move in back to %esi, toss from */
X movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist
*/
X jmp .L_while_test_mmx
X
X.align 16,0x90
X.L_check_dist_one_mmx:
X cmpl $1, dist_r
X jne .L_check_window_mmx
X cmpl out_r, beg(%esp)
X je .L_check_window_mmx
X
X decl out_r
X movl len_r, %ecx
X movb (out_r), %al
X subl $3, %ecx
X
X movb %al, 1(out_r)
X movb %al, 2(out_r)
X movb %al, 3(out_r)
X addl $4, out_r
X rep stosb
X
X movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist
*/
X jmp .L_while_test_mmx
X
X.align 16,0x90
X.L_test_for_second_level_length_mmx:
X /* else if ((op & 64) == 0) {
X * this = lcode[this.val + (hold & mask[op])];
X * }
X *
X * regs: %eax = this, %edx = len = this.val, %ebx = lcode
X * NOTE(review): .L_mask is defined outside this section; it is
X * presumably the table of (1 << n) - 1 values -- confirm.
X */
X testb $64, %al
X jnz .L_test_for_end_of_block /* if ((op & 64) != 0) */
X
X andl $15, %eax /* op &= 15 */
X psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
X movd hold_mm, %ecx
X andl .L_mask(,%eax,4), %ecx /* ecx = hold & mask[op] */
X addl len_r, %ecx /* ecx += this.val */
X movl (%ebx,%ecx,4), %eax /* eax = lcode[val + (hold&mask[op])] */
X jmp .L_dolen_mmx
X
X.align 16,0x90
X.L_test_for_second_level_dist_mmx:
X /* else if ((op & 64) == 0) {
X * this = dcode[this.val + (hold & mask[op])];
X * }
X *
X * regs: %eax = this, %ebx = dist = this.val
X * NOTE(review): .L_mask is defined outside this section; it is
X * presumably the table of (1 << n) - 1 values -- confirm.
X */
X testb $64, %al
X jnz .L_invalid_distance_code /* if ((op & 64) != 0) */
X
X andl $15, %eax /* op &= 15 */
X psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
X movd hold_mm, %ecx
X andl .L_mask(,%eax,4), %ecx /* ecx = hold & mask[op] */
X movl dcode(%esp), %eax /* eax = dcode */
X addl dist_r, %ecx /* ecx += this.val */
X movl (%eax,%ecx,4), %eax /* eax = dcode[val + (hold&mask[op])] */
X jmp .L_dodist_mmx
X
X.align 16,0x90
X.L_clip_window_mmx:
X /* Reached when dist > out - beg: the start of the match lies in the
X * sliding window rather than in the output produced so far.
X *
X * on entry: %eax = out - beg, %ebx = dist, %edx = len, %edi = out;
X * in is already saved in in(%esp), so %esi is free to
X * serve as from
X *
X * else {
X * if (dist > wsize) {
X * invalid distance
X * }
X * from = window;
X * nbytes = dist - nbytes;
X * if (write == 0) {
X * from += wsize - nbytes;
X * if (nbytes < len) {
X * len -= nbytes;
X * do {
X * PUP(out) = PUP(from);
X * } while (--nbytes);
X * from = out - dist;
X * }
X * }
X */
X#define nbytes_r %ecx
X movl %eax, nbytes_r
X movl wsize(%esp), %eax /* prepare for dist compare */
X negl nbytes_r /* nbytes = -nbytes */
X movl window(%esp), from_r /* from = window */
X
X cmpl dist_r, %eax
X jb .L_invalid_distance_too_far /* if (dist > wsize) */
X
X addl dist_r, nbytes_r /* nbytes = dist - nbytes */
X cmpl $0, write(%esp)
X jne .L_wrap_around_window_mmx /* if (write != 0) */
X
X subl nbytes_r, %eax
X addl %eax, from_r /* from += wsize - nbytes */
X
X cmpl nbytes_r, len_r
X jbe .L_do_copy1_mmx /* if (nbytes >= len) */
X
X subl nbytes_r, len_r /* len -= nbytes */
X rep movsb /* copy the nbytes that come from the window */
X movl out_r, from_r
X subl dist_r, from_r /* from = out - dist */
X jmp .L_do_copy1_mmx
X
X.L_wrap_around_window_mmx:
X#define write_r %eax
X movl write(%esp), write_r
X cmpl write_r, nbytes_r
X jbe .L_contiguous_in_window_mmx /* if (write >= nbytes) */
X
X addl wsize(%esp), from_r
X addl write_r, from_r
X subl nbytes_r, from_r /* from += wsize + write - nbytes */
X subl write_r, nbytes_r /* nbytes -= write */
X#undef write_r
X
X cmpl nbytes_r, len_r
X jbe .L_do_copy1_mmx /* if (nbytes >= len) */
X
X subl nbytes_r, len_r /* len -= nbytes */
X rep movsb
X movl window(%esp), from_r /* from = window */
X movl write(%esp), nbytes_r /* nbytes = write */
X cmpl nbytes_r, len_r
X jbe .L_do_copy1_mmx /* if (nbytes >= len) */
X
X subl nbytes_r, len_r /* len -= nbytes */
X rep movsb
X movl out_r, from_r
X subl dist_r, from_r /* from = out - dist */
X jmp .L_do_copy1_mmx
X
X.L_contiguous_in_window_mmx:
X#define write_r %eax
X addl write_r, from_r
X subl nbytes_r, from_r /* from += write - nbytes */
X#undef write_r
X
X cmpl nbytes_r, len_r
X jbe .L_do_copy1_mmx /* if (nbytes >= len) */
X
X subl nbytes_r, len_r /* len -= nbytes */
X rep movsb
X movl out_r, from_r
X subl dist_r, from_r /* from = out - dist */
X
X.L_do_copy1_mmx:
X#undef nbytes_r
X#define in_r %esi
X movl len_r, %ecx
X rep movsb
X
X movl in(%esp), in_r /* move in back to %esi, toss from */
X movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist
*/
X jmp .L_while_test_mmx
X
X#undef hold_r
X#undef bitslong_r
X
X#endif /* USE_MMX || RUN_TIME_MMX */
X
X
X/*** USE_MMX, NO_MMX, and RUNTIME_MMX from here on ***/
X
X.L_invalid_distance_code:
X /* else {
X * strm->msg = "invalid distance code";
X * state->mode = BAD;
X * }
X */
X movl $.L_invalid_distance_code_msg, %ecx
X movl $INFLATE_MODE_BAD, %edx
X jmp .L_update_stream_state
X
X.L_test_for_end_of_block:
X /* else if (op & 32) {
X * state->mode = TYPE;
X * break;
X * }
X */
X testb $32, %al
X jz .L_invalid_literal_length_code /* if ((op & 32) == 0) */
X
X movl $0, %ecx
X movl $INFLATE_MODE_TYPE, %edx
X jmp .L_update_stream_state
X
X.L_invalid_literal_length_code:
X /* else {
X * strm->msg = "invalid literal/length code";
X * state->mode = BAD;
X * }
X */
X movl $.L_invalid_literal_length_code_msg, %ecx
X movl $INFLATE_MODE_BAD, %edx
X jmp .L_update_stream_state
X
X.L_invalid_distance_too_far:
X /* strm->msg = "invalid distance too far back";
X * state->mode = BAD;
X */
X movl in(%esp), in_r /* from_r has in's reg, put in back
*/
X movl $.L_invalid_distance_too_far_msg, %ecx
X movl $INFLATE_MODE_BAD, %edx
X jmp .L_update_stream_state
X
X.L_update_stream_state:
X /* set strm->msg = %ecx, strm->state->mode = %edx */
X movl strm_sp(%esp), %eax
X testl %ecx, %ecx /* if (msg != NULL) */
X jz .L_skip_msg
X movl %ecx, msg_strm(%eax) /* strm->msg = msg */
X.L_skip_msg:
X movl state_strm(%eax), %eax /* state = strm->state */
X movl %edx, mode_state(%eax) /* state->mode = edx (BAD | TYPE) */
X jmp .L_break_loop
X
X.align 32,0x90
X.L_break_loop:
X
X/*
X * Regs:
X *
X * bits = %ebp when mmx, and in %ebx when non-mmx
X * hold = %hold_mm when mmx, and in %ebp when non-mmx
X * in = %esi
X * out = %edi
X */
X
X#if defined( USE_MMX ) || defined( RUN_TIME_MMX )
X
X#if defined( RUN_TIME_MMX )
X
X cmpl $DO_USE_MMX, inflate_fast_use_mmx
X jne .L_update_next_in
X
X#endif /* RUN_TIME_MMX */
X
X movl %ebp, %ebx
X
X.L_update_next_in:
X
X#endif
X
X#define strm_r %eax
X#define state_r %edx
X
X /* len = bits >> 3;
X * in -= len;
X * bits -= len << 3;
X * hold &= (1U << bits) - 1;
X * state->hold = hold;
X * state->bits = bits;
X * strm->next_in = in;
X * strm->next_out = out;
X */
X movl strm_sp(%esp), strm_r
X movl %ebx, %ecx
X movl state_strm(strm_r), state_r
X shrl $3, %ecx
X subl %ecx, in_r
X shll $3, %ecx
X subl %ecx, %ebx
X movl out_r, next_out_strm(strm_r)
X movl %ebx, bits_state(state_r)
X movl %ebx, %ecx
X
X leal buf(%esp), %ebx
X cmpl %ebx, last(%esp)
X jne .L_buf_not_used /* if buf != last */
X
X subl %ebx, in_r /* in -= buf */
X movl next_in_strm(strm_r), %ebx
X movl %ebx, last(%esp) /* last = strm->next_in */
X addl %ebx, in_r /* in += strm->next_in */
X movl avail_in_strm(strm_r), %ebx
X subl $11, %ebx
X addl %ebx, last(%esp) /* last = &strm->next_in[ avail_in - 11 ]
*/
X
X.L_buf_not_used:
X movl in_r, next_in_strm(strm_r)
X
X movl $1, %ebx
X shll %cl, %ebx
X decl %ebx
X
X#if defined( USE_MMX ) || defined( RUN_TIME_MMX )
X
X#if defined( RUN_TIME_MMX )
X
X cmpl $DO_USE_MMX, inflate_fast_use_mmx
X jne .L_update_hold
X
X#endif /* RUN_TIME_MMX */
X
X psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
X movd hold_mm, %ebp
X
X emms
X
X.L_update_hold:
X
X#endif /* USE_MMX || RUN_TIME_MMX */
X
X andl %ebx, %ebp
X movl %ebp, hold_state(state_r)
X
X#define last_r %ebx
X
X /* strm->avail_in = in < last ? 11 + (last - in) : 11 - (in - last)
*/
X movl last(%esp), last_r
X cmpl in_r, last_r
X jbe .L_last_is_smaller /* if (in >= last) */
X
X subl in_r, last_r /* last -= in */
X addl $11, last_r /* last += 11 */
X movl last_r, avail_in_strm(strm_r)
X jmp .L_fixup_out
X.L_last_is_smaller:
X subl last_r, in_r /* in -= last */
X negl in_r /* in = -in */
X addl $11, in_r /* in += 11 */
X movl in_r, avail_in_strm(strm_r)
X
X#undef last_r
X#define end_r %ebx
X
X.L_fixup_out:
X /* strm->avail_out = out < end ? 257 + (end - out) : 257 - (out -
end)*/
X movl end(%esp), end_r
X cmpl out_r, end_r
X jbe .L_end_is_smaller /* if (out >= end) */
X
X subl out_r, end_r /* end -= out */
X addl $257, end_r /* end += 257 */
X movl end_r, avail_out_strm(strm_r)
X jmp .L_done
X.L_end_is_smaller:
X subl end_r, out_r /* out -= end */
X negl out_r /* out = -out */
X addl $257, out_r /* out += 257 */
X movl out_r, avail_out_strm(strm_r)
X
X#undef end_r
X#undef strm_r
X#undef state_r
X
X.L_done:
X addl $local_var_size, %esp
X popf
X popl %ebx
X popl %ebp
X popl %esi
X popl %edi
X ret
X
X#if defined( GAS_ELF )
X/* ELF symbol info: mark inflate_fast as a function symbol and record its
X * size as the distance from its label to this point. */
X.type inflate_fast,@function
X.size inflate_fast,.-inflate_fast
X#endif
END-of-contrib/inflate86/_inffast.S
exit
>Release-Note:
>Audit-Trail:
>Unformatted:
More information about the freebsd-bugs
mailing list