Remove old assembler code in which bugs have manifested.

In addition, there is not sufficient gain from the inflate
assembler code to warrant its inclusion.
This commit is contained in:
Mark Adler 2017-10-12 20:08:53 -07:00
parent a577351394
commit 288f108031
18 changed files with 0 additions and 6172 deletions

View File

@ -8,14 +8,6 @@ ada/ by Dmitriy Anisimkov <anisimkov@yahoo.com>
Support for Ada
See http://zlib-ada.sourceforge.net/
amd64/ by Mikhail Teterin <mi@ALDAN.algebra.com>
asm code for AMD64
See patch at http://www.freebsd.org/cgi/query-pr.cgi?pr=bin/96393
asm686/ by Brian Raiter <breadbox@muppetlabs.com>
asm code for Pentium and PPro/PII, using the AT&T (GNU as) syntax
See http://www.muppetlabs.com/~breadbox/software/assembly.html
blast/ by Mark Adler <madler@alumni.caltech.edu>
Decompressor for output of PKWare Data Compression Library (DCL)
@ -32,9 +24,6 @@ gcc_gvmat64/by Gilles Vollant <info@winimage.com>
infback9/ by Mark Adler <madler@alumni.caltech.edu>
Unsupported diffs to infback to decode the deflate64 format
inflate86/ by Chris Anderson <christop@charm.net>
Tuned x86 gcc asm code to replace inflate_fast()
iostream/ by Kevin Ruland <kevin@rodin.wustl.edu>
A C++ I/O streams interface to the zlib gz* functions
@ -45,16 +34,6 @@ iostream3/ by Ludwig Schwardt <schwardt@sun.ac.za>
and Kevin Ruland <kevin@rodin.wustl.edu>
Yet another C++ I/O streams interface
masmx64/ by Gilles Vollant <info@winimage.com>
x86 64-bit (AMD64 and Intel EM64t) code for x64 assembler to
replace longest_match() and inflate_fast(), also masm x86
64-bits translation of Chris Anderson inflate_fast()
masmx86/ by Gilles Vollant <info@winimage.com>
x86 asm code to replace longest_match() and inflate_fast(),
for Visual C++ and MASM (32 bits).
Based on Brian Raiter (asm686) and Chris Anderson (inflate86)
minizip/ by Gilles Vollant <info@winimage.com>
Mini zip and unzip based on zlib
Includes Zip64 support by Mathias Svensson <mathias@result42.com>

View File

@ -1,452 +0,0 @@
/*
* match.S -- optimized version of longest_match()
* based on the similar work by Gilles Vollant, and Brian Raiter, written 1998
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the BSD License. Use by owners of Che Guevarra
* parafernalia is prohibited, where possible, and highly discouraged
* elsewhere.
*/
#ifndef NO_UNDERLINE
# define match_init _match_init
# define longest_match _longest_match
#endif
#define scanend ebx
#define scanendw bx
#define chainlenwmask edx /* high word: current chain len low word: s->wmask */
#define curmatch rsi
#define curmatchd esi
#define windowbestlen r8
#define scanalign r9
#define scanalignd r9d
#define window r10
#define bestlen r11
#define bestlend r11d
#define scanstart r12d
#define scanstartw r12w
#define scan r13
#define nicematch r14d
#define limit r15
#define limitd r15d
#define prev rcx
/*
* The 258 is a "magic number, not a parameter -- changing it
* breaks the hell loose
*/
#define MAX_MATCH (258)
#define MIN_MATCH (3)
#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
/* stack frame offsets */
#define LocalVarsSize (112)
#define _chainlenwmask ( 8-LocalVarsSize)(%rsp)
#define _windowbestlen (16-LocalVarsSize)(%rsp)
#define save_r14 (24-LocalVarsSize)(%rsp)
#define save_rsi (32-LocalVarsSize)(%rsp)
#define save_rbx (40-LocalVarsSize)(%rsp)
#define save_r12 (56-LocalVarsSize)(%rsp)
#define save_r13 (64-LocalVarsSize)(%rsp)
#define save_r15 (80-LocalVarsSize)(%rsp)
.globl match_init, longest_match
/*
* On AMD64 the first argument of a function (in our case -- the pointer to
* deflate_state structure) is passed in %rdi, hence our offsets below are
* all off of that.
*/
/* you can check the structure offset by running
#include <stdlib.h>
#include <stdio.h>
#include "deflate.h"
void print_depl()
{
deflate_state ds;
deflate_state *s=&ds;
printf("size pointer=%u\n",(int)sizeof(void*));
printf("#define dsWSize (%3u)(%%rdi)\n",(int)(((char*)&(s->w_size))-((char*)s)));
printf("#define dsWMask (%3u)(%%rdi)\n",(int)(((char*)&(s->w_mask))-((char*)s)));
printf("#define dsWindow (%3u)(%%rdi)\n",(int)(((char*)&(s->window))-((char*)s)));
printf("#define dsPrev (%3u)(%%rdi)\n",(int)(((char*)&(s->prev))-((char*)s)));
printf("#define dsMatchLen (%3u)(%%rdi)\n",(int)(((char*)&(s->match_length))-((char*)s)));
printf("#define dsPrevMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_match))-((char*)s)));
printf("#define dsStrStart (%3u)(%%rdi)\n",(int)(((char*)&(s->strstart))-((char*)s)));
printf("#define dsMatchStart (%3u)(%%rdi)\n",(int)(((char*)&(s->match_start))-((char*)s)));
printf("#define dsLookahead (%3u)(%%rdi)\n",(int)(((char*)&(s->lookahead))-((char*)s)));
printf("#define dsPrevLen (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_length))-((char*)s)));
printf("#define dsMaxChainLen (%3u)(%%rdi)\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
printf("#define dsGoodMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->good_match))-((char*)s)));
printf("#define dsNiceMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->nice_match))-((char*)s)));
}
*/
/*
to compile for XCode 3.2 on MacOSX x86_64
- run "gcc -g -c -DXCODE_MAC_X64_STRUCTURE amd64-match.S"
*/
#ifndef CURRENT_LINX_XCODE_MAC_X64_STRUCTURE
#define dsWSize ( 68)(%rdi)
#define dsWMask ( 76)(%rdi)
#define dsWindow ( 80)(%rdi)
#define dsPrev ( 96)(%rdi)
#define dsMatchLen (144)(%rdi)
#define dsPrevMatch (148)(%rdi)
#define dsStrStart (156)(%rdi)
#define dsMatchStart (160)(%rdi)
#define dsLookahead (164)(%rdi)
#define dsPrevLen (168)(%rdi)
#define dsMaxChainLen (172)(%rdi)
#define dsGoodMatch (188)(%rdi)
#define dsNiceMatch (192)(%rdi)
#else
#ifndef STRUCT_OFFSET
# define STRUCT_OFFSET (0)
#endif
#define dsWSize ( 56 + STRUCT_OFFSET)(%rdi)
#define dsWMask ( 64 + STRUCT_OFFSET)(%rdi)
#define dsWindow ( 72 + STRUCT_OFFSET)(%rdi)
#define dsPrev ( 88 + STRUCT_OFFSET)(%rdi)
#define dsMatchLen (136 + STRUCT_OFFSET)(%rdi)
#define dsPrevMatch (140 + STRUCT_OFFSET)(%rdi)
#define dsStrStart (148 + STRUCT_OFFSET)(%rdi)
#define dsMatchStart (152 + STRUCT_OFFSET)(%rdi)
#define dsLookahead (156 + STRUCT_OFFSET)(%rdi)
#define dsPrevLen (160 + STRUCT_OFFSET)(%rdi)
#define dsMaxChainLen (164 + STRUCT_OFFSET)(%rdi)
#define dsGoodMatch (180 + STRUCT_OFFSET)(%rdi)
#define dsNiceMatch (184 + STRUCT_OFFSET)(%rdi)
#endif
.text
/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
longest_match:
/*
* Retrieve the function arguments. %curmatch will hold cur_match
* throughout the entire function (passed via rsi on amd64).
* rdi will hold the pointer to the deflate_state (first arg on amd64)
*/
mov %rsi, save_rsi
mov %rbx, save_rbx
mov %r12, save_r12
mov %r13, save_r13
mov %r14, save_r14
mov %r15, save_r15
/* uInt wmask = s->w_mask; */
/* unsigned chain_length = s->max_chain_length; */
/* if (s->prev_length >= s->good_match) { */
/* chain_length >>= 2; */
/* } */
movl dsPrevLen, %eax
movl dsGoodMatch, %ebx
cmpl %ebx, %eax
movl dsWMask, %eax
movl dsMaxChainLen, %chainlenwmask
jl LastMatchGood
shrl $2, %chainlenwmask
LastMatchGood:
/* chainlen is decremented once beforehand so that the function can */
/* use the sign flag instead of the zero flag for the exit test. */
/* It is then shifted into the high word, to make room for the wmask */
/* value, which it will always accompany. */
decl %chainlenwmask
shll $16, %chainlenwmask
orl %eax, %chainlenwmask
/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
movl dsNiceMatch, %eax
movl dsLookahead, %ebx
cmpl %eax, %ebx
jl LookaheadLess
movl %eax, %ebx
LookaheadLess: movl %ebx, %nicematch
/* register Bytef *scan = s->window + s->strstart; */
mov dsWindow, %window
movl dsStrStart, %limitd
lea (%limit, %window), %scan
/* Determine how many bytes the scan ptr is off from being */
/* dword-aligned. */
mov %scan, %scanalign
negl %scanalignd
andl $3, %scanalignd
/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
movl dsWSize, %eax
subl $MIN_LOOKAHEAD, %eax
xorl %ecx, %ecx
subl %eax, %limitd
cmovng %ecx, %limitd
/* int best_len = s->prev_length; */
movl dsPrevLen, %bestlend
/* Store the sum of s->window + best_len in %windowbestlen locally, and in memory. */
lea (%window, %bestlen), %windowbestlen
mov %windowbestlen, _windowbestlen
/* register ush scan_start = *(ushf*)scan; */
/* register ush scan_end = *(ushf*)(scan+best_len-1); */
/* Posf *prev = s->prev; */
movzwl (%scan), %scanstart
movzwl -1(%scan, %bestlen), %scanend
mov dsPrev, %prev
/* Jump into the main loop. */
movl %chainlenwmask, _chainlenwmask
jmp LoopEntry
.balign 16
/* do {
* match = s->window + cur_match;
* if (*(ushf*)(match+best_len-1) != scan_end ||
* *(ushf*)match != scan_start) continue;
* [...]
* } while ((cur_match = prev[cur_match & wmask]) > limit
* && --chain_length != 0);
*
* Here is the inner loop of the function. The function will spend the
* majority of its time in this loop, and majority of that time will
* be spent in the first ten instructions.
*/
LookupLoop:
andl %chainlenwmask, %curmatchd
movzwl (%prev, %curmatch, 2), %curmatchd
cmpl %limitd, %curmatchd
jbe LeaveNow
subl $0x00010000, %chainlenwmask
js LeaveNow
LoopEntry: cmpw -1(%windowbestlen, %curmatch), %scanendw
jne LookupLoop
cmpw %scanstartw, (%window, %curmatch)
jne LookupLoop
/* Store the current value of chainlen. */
movl %chainlenwmask, _chainlenwmask
/* %scan is the string under scrutiny, and %prev to the string we */
/* are hoping to match it up with. In actuality, %esi and %edi are */
/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
/* initialized to -(MAX_MATCH_8 - scanalign). */
mov $(-MAX_MATCH_8), %rdx
lea (%curmatch, %window), %windowbestlen
lea MAX_MATCH_8(%windowbestlen, %scanalign), %windowbestlen
lea MAX_MATCH_8(%scan, %scanalign), %prev
/* the prefetching below makes very little difference... */
prefetcht1 (%windowbestlen, %rdx)
prefetcht1 (%prev, %rdx)
/*
* Test the strings for equality, 8 bytes at a time. At the end,
* adjust %rdx so that it is offset to the exact byte that mismatched.
*
* It should be confessed that this loop usually does not represent
* much of the total running time. Replacing it with a more
* straightforward "rep cmpsb" would not drastically degrade
* performance -- unrolling it, for example, makes no difference.
*/
#undef USE_SSE /* works, but is 6-7% slower, than non-SSE... */
LoopCmps:
#ifdef USE_SSE
/* Preload the SSE registers */
movdqu (%windowbestlen, %rdx), %xmm1
movdqu (%prev, %rdx), %xmm2
pcmpeqb %xmm2, %xmm1
movdqu 16(%windowbestlen, %rdx), %xmm3
movdqu 16(%prev, %rdx), %xmm4
pcmpeqb %xmm4, %xmm3
movdqu 32(%windowbestlen, %rdx), %xmm5
movdqu 32(%prev, %rdx), %xmm6
pcmpeqb %xmm6, %xmm5
movdqu 48(%windowbestlen, %rdx), %xmm7
movdqu 48(%prev, %rdx), %xmm8
pcmpeqb %xmm8, %xmm7
/* Check the comparisions' results */
pmovmskb %xmm1, %rax
notw %ax
bsfw %ax, %ax
jnz LeaveLoopCmps
/* this is the only iteration of the loop with a possibility of having
incremented rdx by 0x108 (each loop iteration add 16*4 = 0x40
and (0x40*4)+8=0x108 */
add $8, %rdx
jz LenMaximum
add $8, %rdx
pmovmskb %xmm3, %rax
notw %ax
bsfw %ax, %ax
jnz LeaveLoopCmps
add $16, %rdx
pmovmskb %xmm5, %rax
notw %ax
bsfw %ax, %ax
jnz LeaveLoopCmps
add $16, %rdx
pmovmskb %xmm7, %rax
notw %ax
bsfw %ax, %ax
jnz LeaveLoopCmps
add $16, %rdx
jmp LoopCmps
LeaveLoopCmps: add %rax, %rdx
#else
mov (%windowbestlen, %rdx), %rax
xor (%prev, %rdx), %rax
jnz LeaveLoopCmps
mov 8(%windowbestlen, %rdx), %rax
xor 8(%prev, %rdx), %rax
jnz LeaveLoopCmps8
mov 16(%windowbestlen, %rdx), %rax
xor 16(%prev, %rdx), %rax
jnz LeaveLoopCmps16
add $24, %rdx
jnz LoopCmps
jmp LenMaximum
# if 0
/*
* This three-liner is tantalizingly simple, but bsf is a slow instruction,
* and the complicated alternative down below is quite a bit faster. Sad...
*/
LeaveLoopCmps: bsf %rax, %rax /* find the first non-zero bit */
shrl $3, %eax /* divide by 8 to get the byte */
add %rax, %rdx
# else
LeaveLoopCmps16:
add $8, %rdx
LeaveLoopCmps8:
add $8, %rdx
LeaveLoopCmps: testl $0xFFFFFFFF, %eax /* Check the first 4 bytes */
jnz Check16
add $4, %rdx
shr $32, %rax
Check16: testw $0xFFFF, %ax
jnz LenLower
add $2, %rdx
shrl $16, %eax
LenLower: subb $1, %al
adc $0, %rdx
# endif
#endif
/* Calculate the length of the match. If it is longer than MAX_MATCH, */
/* then automatically accept it as the best possible match and leave. */
lea (%prev, %rdx), %rax
sub %scan, %rax
cmpl $MAX_MATCH, %eax
jge LenMaximum
/* If the length of the match is not longer than the best match we */
/* have so far, then forget it and return to the lookup loop. */
cmpl %bestlend, %eax
jg LongerMatch
mov _windowbestlen, %windowbestlen
mov dsPrev, %prev
movl _chainlenwmask, %edx
jmp LookupLoop
/* s->match_start = cur_match; */
/* best_len = len; */
/* if (len >= nice_match) break; */
/* scan_end = *(ushf*)(scan+best_len-1); */
LongerMatch:
movl %eax, %bestlend
movl %curmatchd, dsMatchStart
cmpl %nicematch, %eax
jge LeaveNow
lea (%window, %bestlen), %windowbestlen
mov %windowbestlen, _windowbestlen
movzwl -1(%scan, %rax), %scanend
mov dsPrev, %prev
movl _chainlenwmask, %chainlenwmask
jmp LookupLoop
/* Accept the current string, with the maximum possible length. */
LenMaximum:
movl $MAX_MATCH, %bestlend
movl %curmatchd, dsMatchStart
/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
/* return s->lookahead; */
LeaveNow:
movl dsLookahead, %eax
cmpl %eax, %bestlend
cmovngl %bestlend, %eax
LookaheadRet:
/* Restore the registers and return from whence we came. */
mov save_rsi, %rsi
mov save_rbx, %rbx
mov save_r12, %r12
mov save_r13, %r13
mov save_r14, %r14
mov save_r15, %r15
ret
match_init: ret

View File

@ -1,51 +0,0 @@
This is a patched version of zlib, modified to use
Pentium-Pro-optimized assembly code in the deflation algorithm. The
files changed/added by this patch are:
README.686
match.S
The speedup that this patch provides varies, depending on whether the
compiler used to build the original version of zlib falls afoul of the
PPro's speed traps. My own tests show a speedup of around 10-20% at
the default compression level, and 20-30% using -9, against a version
compiled using gcc 2.7.2.3. Your mileage may vary.
Note that this code has been tailored for the PPro/PII in particular,
and will not perform particuarly well on a Pentium.
If you are using an assembler other than GNU as, you will have to
translate match.S to use your assembler's syntax. (Have fun.)
Brian Raiter
breadbox@muppetlabs.com
April, 1998
Added for zlib 1.1.3:
The patches come from
http://www.muppetlabs.com/~breadbox/software/assembly.html
To compile zlib with this asm file, copy match.S to the zlib directory
then do:
CFLAGS="-O3 -DASMV" ./configure
make OBJA=match.o
Update:
I've been ignoring these assembly routines for years, believing that
gcc's generated code had caught up with it sometime around gcc 2.95
and the major rearchitecting of the Pentium 4. However, I recently
learned that, despite what I believed, this code still has some life
in it. On the Pentium 4 and AMD64 chips, it continues to run about 8%
faster than the code produced by gcc 4.1.
In acknowledgement of its continuing usefulness, I've altered the
license to match that of the rest of zlib. Share and Enjoy!
Brian Raiter
breadbox@muppetlabs.com
April, 2007

View File

@ -1,357 +0,0 @@
/* match.S -- x86 assembly version of the zlib longest_match() function.
* Optimized for the Intel 686 chips (PPro and later).
*
* Copyright (C) 1998, 2007 Brian Raiter <breadbox@muppetlabs.com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#ifndef NO_UNDERLINE
#define match_init _match_init
#define longest_match _longest_match
#endif
#define MAX_MATCH (258)
#define MIN_MATCH (3)
#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
/* stack frame offsets */
#define chainlenwmask 0 /* high word: current chain len */
/* low word: s->wmask */
#define window 4 /* local copy of s->window */
#define windowbestlen 8 /* s->window + bestlen */
#define scanstart 16 /* first two bytes of string */
#define scanend 12 /* last two bytes of string */
#define scanalign 20 /* dword-misalignment of string */
#define nicematch 24 /* a good enough match size */
#define bestlen 28 /* size of best match so far */
#define scan 32 /* ptr to string wanting match */
#define LocalVarsSize (36)
/* saved ebx 36 */
/* saved edi 40 */
/* saved esi 44 */
/* saved ebp 48 */
/* return address 52 */
#define deflatestate 56 /* the function arguments */
#define curmatch 60
/* All the +zlib1222add offsets are due to the addition of fields
* in zlib in the deflate_state structure since the asm code was first written
* (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
* (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
* if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
*/
#define zlib1222add (8)
#define dsWSize (36+zlib1222add)
#define dsWMask (44+zlib1222add)
#define dsWindow (48+zlib1222add)
#define dsPrev (56+zlib1222add)
#define dsMatchLen (88+zlib1222add)
#define dsPrevMatch (92+zlib1222add)
#define dsStrStart (100+zlib1222add)
#define dsMatchStart (104+zlib1222add)
#define dsLookahead (108+zlib1222add)
#define dsPrevLen (112+zlib1222add)
#define dsMaxChainLen (116+zlib1222add)
#define dsGoodMatch (132+zlib1222add)
#define dsNiceMatch (136+zlib1222add)
.file "match.S"
.globl match_init, longest_match
.text
/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
.cfi_sections .debug_frame
longest_match:
.cfi_startproc
/* Save registers that the compiler may be using, and adjust %esp to */
/* make room for our stack frame. */
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset ebp, -8
pushl %edi
.cfi_def_cfa_offset 12
pushl %esi
.cfi_def_cfa_offset 16
pushl %ebx
.cfi_def_cfa_offset 20
subl $LocalVarsSize, %esp
.cfi_def_cfa_offset LocalVarsSize+20
/* Retrieve the function arguments. %ecx will hold cur_match */
/* throughout the entire function. %edx will hold the pointer to the */
/* deflate_state structure during the function's setup (before */
/* entering the main loop). */
movl deflatestate(%esp), %edx
movl curmatch(%esp), %ecx
/* uInt wmask = s->w_mask; */
/* unsigned chain_length = s->max_chain_length; */
/* if (s->prev_length >= s->good_match) { */
/* chain_length >>= 2; */
/* } */
movl dsPrevLen(%edx), %eax
movl dsGoodMatch(%edx), %ebx
cmpl %ebx, %eax
movl dsWMask(%edx), %eax
movl dsMaxChainLen(%edx), %ebx
jl LastMatchGood
shrl $2, %ebx
LastMatchGood:
/* chainlen is decremented once beforehand so that the function can */
/* use the sign flag instead of the zero flag for the exit test. */
/* It is then shifted into the high word, to make room for the wmask */
/* value, which it will always accompany. */
decl %ebx
shll $16, %ebx
orl %eax, %ebx
movl %ebx, chainlenwmask(%esp)
/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
movl dsNiceMatch(%edx), %eax
movl dsLookahead(%edx), %ebx
cmpl %eax, %ebx
jl LookaheadLess
movl %eax, %ebx
LookaheadLess: movl %ebx, nicematch(%esp)
/* register Bytef *scan = s->window + s->strstart; */
movl dsWindow(%edx), %esi
movl %esi, window(%esp)
movl dsStrStart(%edx), %ebp
lea (%esi,%ebp), %edi
movl %edi, scan(%esp)
/* Determine how many bytes the scan ptr is off from being */
/* dword-aligned. */
movl %edi, %eax
negl %eax
andl $3, %eax
movl %eax, scanalign(%esp)
/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
movl dsWSize(%edx), %eax
subl $MIN_LOOKAHEAD, %eax
subl %eax, %ebp
jg LimitPositive
xorl %ebp, %ebp
LimitPositive:
/* int best_len = s->prev_length; */
movl dsPrevLen(%edx), %eax
movl %eax, bestlen(%esp)
/* Store the sum of s->window + best_len in %esi locally, and in %esi. */
addl %eax, %esi
movl %esi, windowbestlen(%esp)
/* register ush scan_start = *(ushf*)scan; */
/* register ush scan_end = *(ushf*)(scan+best_len-1); */
/* Posf *prev = s->prev; */
movzwl (%edi), %ebx
movl %ebx, scanstart(%esp)
movzwl -1(%edi,%eax), %ebx
movl %ebx, scanend(%esp)
movl dsPrev(%edx), %edi
/* Jump into the main loop. */
movl chainlenwmask(%esp), %edx
jmp LoopEntry
.balign 16
/* do {
* match = s->window + cur_match;
* if (*(ushf*)(match+best_len-1) != scan_end ||
* *(ushf*)match != scan_start) continue;
* [...]
* } while ((cur_match = prev[cur_match & wmask]) > limit
* && --chain_length != 0);
*
* Here is the inner loop of the function. The function will spend the
* majority of its time in this loop, and majority of that time will
* be spent in the first ten instructions.
*
* Within this loop:
* %ebx = scanend
* %ecx = curmatch
* %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
* %esi = windowbestlen - i.e., (window + bestlen)
* %edi = prev
* %ebp = limit
*/
LookupLoop:
andl %edx, %ecx
movzwl (%edi,%ecx,2), %ecx
cmpl %ebp, %ecx
jbe LeaveNow
subl $0x00010000, %edx
js LeaveNow
LoopEntry: movzwl -1(%esi,%ecx), %eax
cmpl %ebx, %eax
jnz LookupLoop
movl window(%esp), %eax
movzwl (%eax,%ecx), %eax
cmpl scanstart(%esp), %eax
jnz LookupLoop
/* Store the current value of chainlen. */
movl %edx, chainlenwmask(%esp)
/* Point %edi to the string under scrutiny, and %esi to the string we */
/* are hoping to match it up with. In actuality, %esi and %edi are */
/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
/* initialized to -(MAX_MATCH_8 - scanalign). */
movl window(%esp), %esi
movl scan(%esp), %edi
addl %ecx, %esi
movl scanalign(%esp), %eax
movl $(-MAX_MATCH_8), %edx
lea MAX_MATCH_8(%edi,%eax), %edi
lea MAX_MATCH_8(%esi,%eax), %esi
/* Test the strings for equality, 8 bytes at a time. At the end,
* adjust %edx so that it is offset to the exact byte that mismatched.
*
* We already know at this point that the first three bytes of the
* strings match each other, and they can be safely passed over before
* starting the compare loop. So what this code does is skip over 0-3
* bytes, as much as necessary in order to dword-align the %edi
* pointer. (%esi will still be misaligned three times out of four.)
*
* It should be confessed that this loop usually does not represent
* much of the total running time. Replacing it with a more
* straightforward "rep cmpsb" would not drastically degrade
* performance.
*/
LoopCmps:
movl (%esi,%edx), %eax
xorl (%edi,%edx), %eax
jnz LeaveLoopCmps
movl 4(%esi,%edx), %eax
xorl 4(%edi,%edx), %eax
jnz LeaveLoopCmps4
addl $8, %edx
jnz LoopCmps
jmp LenMaximum
LeaveLoopCmps4: addl $4, %edx
LeaveLoopCmps: testl $0x0000FFFF, %eax
jnz LenLower
addl $2, %edx
shrl $16, %eax
LenLower: subb $1, %al
adcl $0, %edx
/* Calculate the length of the match. If it is longer than MAX_MATCH, */
/* then automatically accept it as the best possible match and leave. */
lea (%edi,%edx), %eax
movl scan(%esp), %edi
subl %edi, %eax
cmpl $MAX_MATCH, %eax
jge LenMaximum
/* If the length of the match is not longer than the best match we */
/* have so far, then forget it and return to the lookup loop. */
movl deflatestate(%esp), %edx
movl bestlen(%esp), %ebx
cmpl %ebx, %eax
jg LongerMatch
movl windowbestlen(%esp), %esi
movl dsPrev(%edx), %edi
movl scanend(%esp), %ebx
movl chainlenwmask(%esp), %edx
jmp LookupLoop
/* s->match_start = cur_match; */
/* best_len = len; */
/* if (len >= nice_match) break; */
/* scan_end = *(ushf*)(scan+best_len-1); */
LongerMatch: movl nicematch(%esp), %ebx
movl %eax, bestlen(%esp)
movl %ecx, dsMatchStart(%edx)
cmpl %ebx, %eax
jge LeaveNow
movl window(%esp), %esi
addl %eax, %esi
movl %esi, windowbestlen(%esp)
movzwl -1(%edi,%eax), %ebx
movl dsPrev(%edx), %edi
movl %ebx, scanend(%esp)
movl chainlenwmask(%esp), %edx
jmp LookupLoop
/* Accept the current string, with the maximum possible length. */
LenMaximum: movl deflatestate(%esp), %edx
movl $MAX_MATCH, bestlen(%esp)
movl %ecx, dsMatchStart(%edx)
/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
/* return s->lookahead; */
LeaveNow:
movl deflatestate(%esp), %edx
movl bestlen(%esp), %ebx
movl dsLookahead(%edx), %eax
cmpl %eax, %ebx
jg LookaheadRet
movl %ebx, %eax
LookaheadRet:
/* Restore the stack and return from whence we came. */
addl $LocalVarsSize, %esp
.cfi_def_cfa_offset 20
popl %ebx
.cfi_def_cfa_offset 16
popl %esi
.cfi_def_cfa_offset 12
popl %edi
.cfi_def_cfa_offset 8
popl %ebp
.cfi_def_cfa_offset 4
.cfi_endproc
match_init: ret

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +0,0 @@
ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
ml64.exe /Flgvmat64 /c /Zi gvmat64.asm

View File

@ -1,553 +0,0 @@
;uInt longest_match_x64(
; deflate_state *s,
; IPos cur_match); /* current match */
; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86_64
; (AMD64 on Athlon 64, Opteron, Phenom
; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
;
; File written by Gilles Vollant, by converting to assembly the longest_match
; from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
;
; and by taking inspiration on asm686 with masm, optimised assembly code
; from Brian Raiter, written 1998
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software
; 3. This notice may not be removed or altered from any source distribution.
;
;
;
; http://www.zlib.net
; http://www.winimage.com/zLibDll
; http://www.muppetlabs.com/~breadbox/software/assembly.html
;
; to compile this file for infozip Zip, I use option:
; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
;
; to compile this file for zLib, I use option:
; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
; Be carrefull to adapt zlib1222add below to your version of zLib
; (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
; value of zlib1222add later)
;
; This file compile with Microsoft Macro Assembler (x64) for AMD64
;
; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
;
; (you can get Windows WDK with ml64 for AMD64 from
; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
;
;uInt longest_match(s, cur_match)
; deflate_state *s;
; IPos cur_match; /* current match */
.code
longest_match PROC
;LocalVarsSize equ 88
LocalVarsSize equ 72
; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
; free register : r14,r15
; register can be saved : rsp
chainlenwmask equ rsp + 8 - LocalVarsSize ; high word: current chain len
; low word: s->wmask
;window equ rsp + xx - LocalVarsSize ; local copy of s->window ; stored in r10
;windowbestlen equ rsp + xx - LocalVarsSize ; s->window + bestlen , use r10+r11
;scanstart equ rsp + xx - LocalVarsSize ; first two bytes of string ; stored in r12w
;scanend equ rsp + xx - LocalVarsSize ; last two bytes of string use ebx
;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13
;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d
;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9
IFDEF INFOZIP
ELSE
nicematch equ (rsp + 16 - LocalVarsSize) ; a good enough match size
ENDIF
save_rdi equ rsp + 24 - LocalVarsSize
save_rsi equ rsp + 32 - LocalVarsSize
save_rbx equ rsp + 40 - LocalVarsSize
save_rbp equ rsp + 48 - LocalVarsSize
save_r12 equ rsp + 56 - LocalVarsSize
save_r13 equ rsp + 64 - LocalVarsSize
;save_r14 equ rsp + 72 - LocalVarsSize
;save_r15 equ rsp + 80 - LocalVarsSize
; summary of register usage
; scanend ebx
; scanendw bx
; chainlenwmask edx
; curmatch rsi
; curmatchd esi
; windowbestlen r8
; scanalign r9
; scanalignd r9d
; window r10
; bestlen r11
; bestlend r11d
; scanstart r12d
; scanstartw r12w
; scan r13
; nicematch r14d
; limit r15
; limitd r15d
; prev rcx
; all the +4 offsets are due to the addition of pending_buf_size (in zlib
; in the deflate_state structure since the asm code was first written
; (if you compile with zlib 1.0.4 or older, remove the +4).
; Note : these value are good with a 8 bytes boundary pack structure
MAX_MATCH equ 258
MIN_MATCH equ 3
MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
;;; Offsets for fields in the deflate_state structure. These numbers
;;; are calculated from the definition of deflate_state, with the
;;; assumption that the compiler will dword-align the fields. (Thus,
;;; changing the definition of deflate_state could easily cause this
;;; program to crash horribly, without so much as a warning at
;;; compile time. Sigh.)
; all the +zlib1222add offsets are due to the addition of fields
; in zlib in the deflate_state structure since the asm code was first written
; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
IFDEF INFOZIP
_DATA SEGMENT
COMM window_size:DWORD
; WMask ; 7fff
COMM window:BYTE:010040H
COMM prev:WORD:08000H
; MatchLen : unused
; PrevMatch : unused
COMM strstart:DWORD
COMM match_start:DWORD
; Lookahead : ignore
COMM prev_length:DWORD ; PrevLen
COMM max_chain_length:DWORD
COMM good_match:DWORD
COMM nice_match:DWORD
prev_ad equ OFFSET prev
window_ad equ OFFSET window
nicematch equ nice_match
_DATA ENDS
WMask equ 07fffh
ELSE
IFNDEF zlib1222add
zlib1222add equ 8
ENDIF
dsWSize equ 56+zlib1222add+(zlib1222add/2)
dsWMask equ 64+zlib1222add+(zlib1222add/2)
dsWindow equ 72+zlib1222add
dsPrev equ 88+zlib1222add
dsMatchLen equ 128+zlib1222add
dsPrevMatch equ 132+zlib1222add
dsStrStart equ 140+zlib1222add
dsMatchStart equ 144+zlib1222add
dsLookahead equ 148+zlib1222add
dsPrevLen equ 152+zlib1222add
dsMaxChainLen equ 156+zlib1222add
dsGoodMatch equ 172+zlib1222add
dsNiceMatch equ 176+zlib1222add
window_size equ [ rcx + dsWSize]
WMask equ [ rcx + dsWMask]
window_ad equ [ rcx + dsWindow]
prev_ad equ [ rcx + dsPrev]
strstart equ [ rcx + dsStrStart]
match_start equ [ rcx + dsMatchStart]
Lookahead equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
prev_length equ [ rcx + dsPrevLen]
max_chain_length equ [ rcx + dsMaxChainLen]
good_match equ [ rcx + dsGoodMatch]
nice_match equ [ rcx + dsNiceMatch]
ENDIF
; parameter 1 in r8(deflate state s), param 2 in rdx (cur match)
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
;;; Save registers that the compiler may be using, and adjust esp to
;;; make room for our stack frame.
;;; Retrieve the function arguments. r8d will hold cur_match
;;; throughout the entire function. edx will hold the pointer to the
;;; deflate_state structure during the function's setup (before
;;; entering the main loop.
; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
mov [save_rdi],rdi
mov [save_rsi],rsi
mov [save_rbx],rbx
mov [save_rbp],rbp
IFDEF INFOZIP
mov r8d,ecx
ELSE
mov r8d,edx
ENDIF
mov [save_r12],r12
mov [save_r13],r13
; mov [save_r14],r14
; mov [save_r15],r15
;;; uInt wmask = s->w_mask;
;;; unsigned chain_length = s->max_chain_length;
;;; if (s->prev_length >= s->good_match) {
;;; chain_length >>= 2;
;;; }
mov edi, prev_length
mov esi, good_match
mov eax, WMask
mov ebx, max_chain_length
cmp edi, esi
jl LastMatchGood
shr ebx, 2
LastMatchGood:
;;; chainlen is decremented once beforehand so that the function can
;;; use the sign flag instead of the zero flag for the exit test.
;;; It is then shifted into the high word, to make room for the wmask
;;; value, which it will always accompany.
dec ebx
shl ebx, 16
or ebx, eax
;;; on zlib only
;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
IFDEF INFOZIP
mov [chainlenwmask], ebx
; on infozip nice_match = [nice_match]
ELSE
mov eax, nice_match
mov [chainlenwmask], ebx
mov r10d, Lookahead
cmp r10d, eax
cmovnl r10d, eax
mov [nicematch],r10d
ENDIF
;;; register Bytef *scan = s->window + s->strstart;
mov r10, window_ad
mov ebp, strstart
lea r13, [r10 + rbp]
;;; Determine how many bytes the scan ptr is off from being
;;; dword-aligned.
mov r9,r13
neg r13
and r13,3
;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
IFDEF INFOZIP
mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1))
ELSE
mov eax, window_size
sub eax, MIN_LOOKAHEAD
ENDIF
xor edi,edi
sub ebp, eax
mov r11d, prev_length
cmovng ebp,edi
;;; int best_len = s->prev_length;
;;; Store the sum of s->window + best_len in esi locally, and in esi.
lea rsi,[r10+r11]
;;; register ush scan_start = *(ushf*)scan;
;;; register ush scan_end = *(ushf*)(scan+best_len-1);
;;; Posf *prev = s->prev;
movzx r12d,word ptr [r9]
movzx ebx, word ptr [r9 + r11 - 1]
mov rdi, prev_ad
;;; Jump into the main loop.
mov edx, [chainlenwmask]
cmp bx,word ptr [rsi + r8 - 1]
jz LookupLoopIsZero
LookupLoop1:
and r8d, edx
movzx r8d, word ptr [rdi + r8*2]
cmp r8d, ebp
jbe LeaveNow
sub edx, 00010000h
js LeaveNow
LoopEntry1:
cmp bx,word ptr [rsi + r8 - 1]
jz LookupLoopIsZero
LookupLoop2:
and r8d, edx
movzx r8d, word ptr [rdi + r8*2]
cmp r8d, ebp
jbe LeaveNow
sub edx, 00010000h
js LeaveNow
LoopEntry2:
cmp bx,word ptr [rsi + r8 - 1]
jz LookupLoopIsZero
LookupLoop4:
and r8d, edx
movzx r8d, word ptr [rdi + r8*2]
cmp r8d, ebp
jbe LeaveNow
sub edx, 00010000h
js LeaveNow
LoopEntry4:
cmp bx,word ptr [rsi + r8 - 1]
jnz LookupLoop1
jmp LookupLoopIsZero
;;; do {
;;; match = s->window + cur_match;
;;; if (*(ushf*)(match+best_len-1) != scan_end ||
;;; *(ushf*)match != scan_start) continue;
;;; [...]
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
;;; && --chain_length != 0);
;;;
;;; Here is the inner loop of the function. The function will spend the
;;; majority of its time in this loop, and majority of that time will
;;; be spent in the first ten instructions.
;;;
;;; Within this loop:
;;; ebx = scanend
;;; r8d = curmatch
;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
;;; esi = windowbestlen - i.e., (window + bestlen)
;;; edi = prev
;;; ebp = limit
LookupLoop:
and r8d, edx
movzx r8d, word ptr [rdi + r8*2]
cmp r8d, ebp
jbe LeaveNow
sub edx, 00010000h
js LeaveNow
LoopEntry:
cmp bx,word ptr [rsi + r8 - 1]
jnz LookupLoop1
LookupLoopIsZero:
cmp r12w, word ptr [r10 + r8]
jnz LookupLoop1
;;; Store the current value of chainlen.
mov [chainlenwmask], edx
;;; Point edi to the string under scrutiny, and esi to the string we
;;; are hoping to match it up with. In actuality, esi and edi are
;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
;;; initialized to -(MAX_MATCH_8 - scanalign).
lea rsi,[r8+r10]
mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
prefetcht1 [rsi+rdx]
prefetcht1 [rdi+rdx]
;;; Test the strings for equality, 8 bytes at a time. At the end,
;;; adjust rdx so that it is offset to the exact byte that mismatched.
;;;
;;; We already know at this point that the first three bytes of the
;;; strings match each other, and they can be safely passed over before
;;; starting the compare loop. So what this code does is skip over 0-3
;;; bytes, as much as necessary in order to dword-align the edi
;;; pointer. (rsi will still be misaligned three times out of four.)
;;;
;;; It should be confessed that this loop usually does not represent
;;; much of the total running time. Replacing it with a more
;;; straightforward "rep cmpsb" would not drastically degrade
;;; performance.
LoopCmps:
mov rax, [rsi + rdx]
xor rax, [rdi + rdx]
jnz LeaveLoopCmps
mov rax, [rsi + rdx + 8]
xor rax, [rdi + rdx + 8]
jnz LeaveLoopCmps8
mov rax, [rsi + rdx + 8+8]
xor rax, [rdi + rdx + 8+8]
jnz LeaveLoopCmps16
add rdx,8+8+8
jnz short LoopCmps
jmp short LenMaximum
LeaveLoopCmps16: add rdx,8
LeaveLoopCmps8: add rdx,8
LeaveLoopCmps:
test eax, 0000FFFFh
jnz LenLower
test eax,0ffffffffh
jnz LenLower32
add rdx,4
shr rax,32
or ax,ax
jnz LenLower
LenLower32:
shr eax,16
add rdx,2
LenLower: sub al, 1
adc rdx, 0
;;; Calculate the length of the match. If it is longer than MAX_MATCH,
;;; then automatically accept it as the best possible match and leave.
lea rax, [rdi + rdx]
sub rax, r9
cmp eax, MAX_MATCH
jge LenMaximum
;;; If the length of the match is not longer than the best match we
;;; have so far, then forget it and return to the lookup loop.
;///////////////////////////////////
cmp eax, r11d
jg LongerMatch
lea rsi,[r10+r11]
mov rdi, prev_ad
mov edx, [chainlenwmask]
jmp LookupLoop
;;; s->match_start = cur_match;
;;; best_len = len;
;;; if (len >= nice_match) break;
;;; scan_end = *(ushf*)(scan+best_len-1);
LongerMatch:
mov r11d, eax
mov match_start, r8d
cmp eax, [nicematch]
jge LeaveNow
lea rsi,[r10+rax]
movzx ebx, word ptr [r9 + rax - 1]
mov rdi, prev_ad
mov edx, [chainlenwmask]
jmp LookupLoop
;;; Accept the current string, with the maximum possible length.
LenMaximum:
mov r11d,MAX_MATCH
mov match_start, r8d
;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
;;; return s->lookahead;
LeaveNow:
IFDEF INFOZIP
mov eax,r11d
ELSE
mov eax, Lookahead
cmp r11d, eax
cmovng eax, r11d
ENDIF
;;; Restore the stack and return from whence we came.
mov rsi,[save_rsi]
mov rdi,[save_rdi]
mov rbx,[save_rbx]
mov rbp,[save_rbp]
mov r12,[save_r12]
mov r13,[save_r13]
; mov r14,[save_r14]
; mov r15,[save_r15]
ret 0
; please don't remove this string !
; Your can freely use gvmat64 in any free or commercial app
; but it is far better don't remove the string in the binary!
db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
longest_match ENDP
match_init PROC
ret 0
match_init ENDP
END

View File

@ -1,186 +0,0 @@
/* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding
* version for AMD64 on Windows using Microsoft C compiler
*
* Copyright (C) 1995-2003 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Copyright (C) 2003 Chris Anderson <christop@charm.net>
* Please use the copyright conditions above.
*
* 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant
*
* inffas8664.c call function inffas8664fnc in inffasx64.asm
* inffasx64.asm is automatically convert from AMD64 portion of inffas86.c
*
* Dec-29-2003 -- I added AMD64 inflate asm support. This version is also
* slightly quicker on x86 systems because, instead of using rep movsb to copy
* data, it uses rep movsw, which moves data in 2-byte chunks instead of single
* bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates
* from http://fedora.linux.duke.edu/fc1_x86_64
* which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with
* 1GB ram. The 64-bit version is about 4% faster than the 32-bit version,
* when decompressing mozilla-source-1.3.tar.gz.
*
* Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
* the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
* the moment. I have successfully compiled and tested this code with gcc2.96,
* gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
* compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
* enabled. I will attempt to merge the MMX code into this version. Newer
* versions of this and inffast.S can be found at
* http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
*
*/
#include <stdio.h>
#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
#include "inffast.h"
/* Mark Adler's comments from inffast.c: */
/*
Decode literal, length, and distance codes and write out the resulting
literal and match bytes until either not enough input or output is
available, an end-of-block is encountered, or a data error is encountered.
When large enough input and output buffers are supplied to inflate(), for
example, a 16K input buffer and a 64K output buffer, more than 95% of the
inflate execution time is spent in this routine.
Entry assumptions:
state->mode == LEN
strm->avail_in >= 6
strm->avail_out >= 258
start >= strm->avail_out
state->bits < 8
On return, state->mode is one of:
LEN -- ran out of enough output space or enough available input
TYPE -- reached end of block code, inflate() to interpret next block
BAD -- error in block data
Notes:
- The maximum input bits used by a length/distance pair is 15 bits for the
length code, 5 bits for the length extra, 15 bits for the distance code,
and 13 bits for the distance extra. This totals 48 bits, or six bytes.
Therefore if strm->avail_in >= 6, then there is enough input to avoid
checking for available input while decoding.
- The maximum bytes that a single length/distance pair can output is 258
bytes, which is the maximum length that can be coded. inflate_fast()
requires strm->avail_out >= 258 for each loop to avoid checking for
output space.
*/
typedef struct inffast_ar {
/* 64 32 x86 x86_64 */
/* ar offset register */
/* 0 0 */ void *esp; /* esp save */
/* 8 4 */ void *ebp; /* ebp save */
/* 16 8 */ unsigned char FAR *in; /* esi rsi local strm->next_in */
/* 24 12 */ unsigned char FAR *last; /* r9 while in < last */
/* 32 16 */ unsigned char FAR *out; /* edi rdi local strm->next_out */
/* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out */
/* 48 24 */ unsigned char FAR *end; /* r10 while out < end */
/* 56 28 */ unsigned char FAR *window;/* size of window, wsize!=0 */
/* 64 32 */ code const FAR *lcode; /* ebp rbp local strm->lencode */
/* 72 36 */ code const FAR *dcode; /* r11 local strm->distcode */
/* 80 40 */ size_t /*unsigned long */hold; /* edx rdx local strm->hold */
/* 88 44 */ unsigned bits; /* ebx rbx local strm->bits */
/* 92 48 */ unsigned wsize; /* window size */
/* 96 52 */ unsigned write; /* window write index */
/*100 56 */ unsigned lmask; /* r12 mask for lcode */
/*104 60 */ unsigned dmask; /* r13 mask for dcode */
/*108 64 */ unsigned len; /* r14 match length */
/*112 68 */ unsigned dist; /* r15 match distance */
/*116 72 */ unsigned status; /* set when state chng*/
} type_ar;
#ifdef ASMINF
void inflate_fast(strm, start)
z_streamp strm;
unsigned start; /* inflate()'s starting value for strm->avail_out */
{
struct inflate_state FAR *state;
type_ar ar;
void inffas8664fnc(struct inffast_ar * par);
#if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64))
#define PAD_AVAIL_IN 6
#define PAD_AVAIL_OUT 258
#else
#define PAD_AVAIL_IN 5
#define PAD_AVAIL_OUT 257
#endif
/* copy state to local variables */
state = (struct inflate_state FAR *)strm->state;
ar.in = strm->next_in;
ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN);
ar.out = strm->next_out;
ar.beg = ar.out - (start - strm->avail_out);
ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT);
ar.wsize = state->wsize;
ar.write = state->wnext;
ar.window = state->window;
ar.hold = state->hold;
ar.bits = state->bits;
ar.lcode = state->lencode;
ar.dcode = state->distcode;
ar.lmask = (1U << state->lenbits) - 1;
ar.dmask = (1U << state->distbits) - 1;
/* decode literals and length/distances until end-of-block or not enough
input data or output space */
/* align in on 1/2 hold size boundary */
while (((size_t)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) {
ar.hold += (unsigned long)*ar.in++ << ar.bits;
ar.bits += 8;
}
inffas8664fnc(&ar);
if (ar.status > 1) {
if (ar.status == 2)
strm->msg = "invalid literal/length code";
else if (ar.status == 3)
strm->msg = "invalid distance code";
else
strm->msg = "invalid distance too far back";
state->mode = BAD;
}
else if ( ar.status == 1 ) {
state->mode = TYPE;
}
/* return unused bytes (on entry, bits < 8, so in won't go too far back) */
ar.len = ar.bits >> 3;
ar.in -= ar.len;
ar.bits -= ar.len << 3;
ar.hold &= (1U << ar.bits) - 1;
/* update state and return */
strm->next_in = ar.in;
strm->next_out = ar.out;
strm->avail_in = (unsigned)(ar.in < ar.last ?
PAD_AVAIL_IN + (ar.last - ar.in) :
PAD_AVAIL_IN - (ar.in - ar.last));
strm->avail_out = (unsigned)(ar.out < ar.end ?
PAD_AVAIL_OUT + (ar.end - ar.out) :
PAD_AVAIL_OUT - (ar.out - ar.end));
state->hold = (unsigned long)ar.hold;
state->bits = ar.bits;
return;
}
#endif

View File

@ -1,396 +0,0 @@
; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding
; version for AMD64 on Windows using Microsoft C compiler
;
; inffasx64.asm is automatically convert from AMD64 portion of inffas86.c
; inffasx64.asm is called by inffas8664.c, which contain more info.
; to compile this file, I use option
; ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
; with Microsoft Macro Assembler (x64) for AMD64
;
; This file compile with Microsoft Macro Assembler (x64) for AMD64
;
; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
;
; (you can get Windows WDK with ml64 for AMD64 from
; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
;
.code
inffas8664fnc PROC
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
; rax, rcx, rdx, r8, r-9, r10, and r11, which are scratch.
mov [rsp-8],rsi
mov [rsp-16],rdi
mov [rsp-24],r12
mov [rsp-32],r13
mov [rsp-40],r14
mov [rsp-48],r15
mov [rsp-56],rbx
mov rax,rcx
mov [rax+8], rbp ; /* save regs rbp and rsp */
mov [rax], rsp
mov rsp, rax ; /* make rsp point to &ar */
mov rsi, [rsp+16] ; /* rsi = in */
mov rdi, [rsp+32] ; /* rdi = out */
mov r9, [rsp+24] ; /* r9 = last */
mov r10, [rsp+48] ; /* r10 = end */
mov rbp, [rsp+64] ; /* rbp = lcode */
mov r11, [rsp+72] ; /* r11 = dcode */
mov rdx, [rsp+80] ; /* rdx = hold */
mov ebx, [rsp+88] ; /* ebx = bits */
mov r12d, [rsp+100] ; /* r12d = lmask */
mov r13d, [rsp+104] ; /* r13d = dmask */
; /* r14d = len */
; /* r15d = dist */
cld
cmp r10, rdi
je L_one_time ; /* if only one decode left */
cmp r9, rsi
jne L_do_loop
L_one_time:
mov r8, r12 ; /* r8 = lmask */
cmp bl, 32
ja L_get_length_code_one_time
lodsd ; /* eax = *(uint *)in++ */
mov cl, bl ; /* cl = bits, needs it for shifting */
add bl, 32 ; /* bits += 32 */
shl rax, cl
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
jmp L_get_length_code_one_time
ALIGN 4
L_while_test:
cmp r10, rdi
jbe L_break_loop
cmp r9, rsi
jbe L_break_loop
L_do_loop:
mov r8, r12 ; /* r8 = lmask */
cmp bl, 32
ja L_get_length_code ; /* if (32 < bits) */
lodsd ; /* eax = *(uint *)in++ */
mov cl, bl ; /* cl = bits, needs it for shifting */
add bl, 32 ; /* bits += 32 */
shl rax, cl
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
L_get_length_code:
and r8, rdx ; /* r8 &= hold */
mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
mov cl, ah ; /* cl = this.bits */
sub bl, ah ; /* bits -= this.bits */
shr rdx, cl ; /* hold >>= this.bits */
test al, al
jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
mov r8, r12 ; /* r8 = lmask */
shr eax, 16 ; /* output this.val char */
stosb
L_get_length_code_one_time:
and r8, rdx ; /* r8 &= hold */
mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
L_dolen:
mov cl, ah ; /* cl = this.bits */
sub bl, ah ; /* bits -= this.bits */
shr rdx, cl ; /* hold >>= this.bits */
test al, al
jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
shr eax, 16 ; /* output this.val char */
stosb
jmp L_while_test
ALIGN 4
L_test_for_length_base:
mov r14d, eax ; /* len = this */
shr r14d, 16 ; /* len = this.val */
mov cl, al
test al, 16
jz L_test_for_second_level_length ; /* if ((op & 16) == 0) 8% */
and cl, 15 ; /* op &= 15 */
jz L_decode_distance ; /* if (!op) */
L_add_bits_to_len:
sub bl, cl
xor eax, eax
inc eax
shl eax, cl
dec eax
and eax, edx ; /* eax &= hold */
shr rdx, cl
add r14d, eax ; /* len += hold & mask[op] */
L_decode_distance:
mov r8, r13 ; /* r8 = dmask */
cmp bl, 32
ja L_get_distance_code ; /* if (32 < bits) */
lodsd ; /* eax = *(uint *)in++ */
mov cl, bl ; /* cl = bits, needs it for shifting */
add bl, 32 ; /* bits += 32 */
shl rax, cl
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
L_get_distance_code:
and r8, rdx ; /* r8 &= hold */
mov eax, [r11+r8*4] ; /* eax = dcode[hold & dmask] */
L_dodist:
mov r15d, eax ; /* dist = this */
shr r15d, 16 ; /* dist = this.val */
mov cl, ah
sub bl, ah ; /* bits -= this.bits */
shr rdx, cl ; /* hold >>= this.bits */
mov cl, al ; /* cl = this.op */
test al, 16 ; /* if ((op & 16) == 0) */
jz L_test_for_second_level_dist
and cl, 15 ; /* op &= 15 */
jz L_check_dist_one
L_add_bits_to_dist:
sub bl, cl
xor eax, eax
inc eax
shl eax, cl
dec eax ; /* (1 << op) - 1 */
and eax, edx ; /* eax &= hold */
shr rdx, cl
add r15d, eax ; /* dist += hold & ((1 << op) - 1) */
L_check_window:
mov r8, rsi ; /* save in so from can use it's reg */
mov rax, rdi
sub rax, [rsp+40] ; /* nbytes = out - beg */
cmp eax, r15d
jb L_clip_window ; /* if (dist > nbytes) 4.2% */
mov ecx, r14d ; /* ecx = len */
mov rsi, rdi
sub rsi, r15 ; /* from = out - dist */
sar ecx, 1
jnc L_copy_two ; /* if len % 2 == 0 */
rep movsw
mov al, [rsi]
mov [rdi], al
inc rdi
mov rsi, r8 ; /* move in back to %rsi, toss from */
jmp L_while_test
L_copy_two:
rep movsw
mov rsi, r8 ; /* move in back to %rsi, toss from */
jmp L_while_test
ALIGN 4
L_check_dist_one:
cmp r15d, 1 ; /* if dist 1, is a memset */
jne L_check_window
cmp [rsp+40], rdi ; /* if out == beg, outside window */
je L_check_window
mov ecx, r14d ; /* ecx = len */
mov al, [rdi-1]
mov ah, al
sar ecx, 1
jnc L_set_two
mov [rdi], al
inc rdi
L_set_two:
rep stosw
jmp L_while_test
ALIGN 4
L_test_for_second_level_length:
test al, 64
jnz L_test_for_end_of_block ; /* if ((op & 64) != 0) */
xor eax, eax
inc eax
shl eax, cl
dec eax
and eax, edx ; /* eax &= hold */
add eax, r14d ; /* eax += len */
mov eax, [rbp+rax*4] ; /* eax = lcode[val+(hold&mask[op])]*/
jmp L_dolen
ALIGN 4
L_test_for_second_level_dist:
test al, 64
jnz L_invalid_distance_code ; /* if ((op & 64) != 0) */
xor eax, eax
inc eax
shl eax, cl
dec eax
and eax, edx ; /* eax &= hold */
add eax, r15d ; /* eax += dist */
mov eax, [r11+rax*4] ; /* eax = dcode[val+(hold&mask[op])]*/
jmp L_dodist
ALIGN 4
L_clip_window:
mov ecx, eax ; /* ecx = nbytes */
mov eax, [rsp+92] ; /* eax = wsize, prepare for dist cmp */
neg ecx ; /* nbytes = -nbytes */
cmp eax, r15d
jb L_invalid_distance_too_far ; /* if (dist > wsize) */
add ecx, r15d ; /* nbytes = dist - nbytes */
cmp dword ptr [rsp+96], 0
jne L_wrap_around_window ; /* if (write != 0) */
mov rsi, [rsp+56] ; /* from = window */
sub eax, ecx ; /* eax -= nbytes */
add rsi, rax ; /* from += wsize - nbytes */
mov eax, r14d ; /* eax = len */
cmp r14d, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* eax -= nbytes */
rep movsb
mov rsi, rdi
sub rsi, r15 ; /* from = &out[ -dist ] */
jmp L_do_copy
ALIGN 4
L_wrap_around_window:
mov eax, [rsp+96] ; /* eax = write */
cmp ecx, eax
jbe L_contiguous_in_window ; /* if (write >= nbytes) */
mov esi, [rsp+92] ; /* from = wsize */
add rsi, [rsp+56] ; /* from += window */
add rsi, rax ; /* from += write */
sub rsi, rcx ; /* from -= nbytes */
sub ecx, eax ; /* nbytes -= write */
mov eax, r14d ; /* eax = len */
cmp eax, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* len -= nbytes */
rep movsb
mov rsi, [rsp+56] ; /* from = window */
mov ecx, [rsp+96] ; /* nbytes = write */
cmp eax, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* len -= nbytes */
rep movsb
mov rsi, rdi
sub rsi, r15 ; /* from = out - dist */
jmp L_do_copy
ALIGN 4
L_contiguous_in_window:
mov rsi, [rsp+56] ; /* rsi = window */
add rsi, rax
sub rsi, rcx ; /* from += write - nbytes */
mov eax, r14d ; /* eax = len */
cmp eax, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* len -= nbytes */
rep movsb
mov rsi, rdi
sub rsi, r15 ; /* from = out - dist */
jmp L_do_copy ; /* if (nbytes >= len) */
ALIGN 4
L_do_copy:
mov ecx, eax ; /* ecx = len */
rep movsb
mov rsi, r8 ; /* move in back to %esi, toss from */
jmp L_while_test
L_test_for_end_of_block:
test al, 32
jz L_invalid_literal_length_code
mov dword ptr [rsp+116], 1
jmp L_break_loop_with_status
L_invalid_literal_length_code:
mov dword ptr [rsp+116], 2
jmp L_break_loop_with_status
L_invalid_distance_code:
mov dword ptr [rsp+116], 3
jmp L_break_loop_with_status
L_invalid_distance_too_far:
mov dword ptr [rsp+116], 4
jmp L_break_loop_with_status
L_break_loop:
mov dword ptr [rsp+116], 0
L_break_loop_with_status:
; /* put in, out, bits, and hold back into ar and pop esp */
mov [rsp+16], rsi ; /* in */
mov [rsp+32], rdi ; /* out */
mov [rsp+88], ebx ; /* bits */
mov [rsp+80], rdx ; /* hold */
mov rax, [rsp] ; /* restore rbp and rsp */
mov rbp, [rsp+8]
mov rsp, rax
mov rsi,[rsp-8]
mov rdi,[rsp-16]
mov r12,[rsp-24]
mov r13,[rsp-32]
mov r14,[rsp-40]
mov r15,[rsp-48]
mov rbx,[rsp-56]
ret 0
; :
; : "m" (ar)
; : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
; "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
; );
inffas8664fnc ENDP
;_TEXT ENDS
END

View File

@ -1,31 +0,0 @@
Summary
-------
This directory contains ASM implementations of the functions
longest_match() and inflate_fast(), for 64 bits x86 (both AMD64 and Intel EM64t),
for use with Microsoft Macro Assembler (x64) for AMD64 and Microsoft C++ 64 bits.
gvmat64.asm is written by Gilles Vollant (2005), by using Brian Raiter 686/32 bits
assembly optimized version from Jean-loup Gailly original longest_match function
inffasx64.asm and inffas8664.c were written by Chris Anderson, by optimizing
original function from Mark Adler
Use instructions
----------------
Assemble the .asm files using MASM and put the object files into the zlib source
directory. You can also get object files here:
http://www.winimage.com/zLibDll/zlib124_masm_obj.zip
define ASMV and ASMINF in your project. Include inffas8664.c in your source tree,
and inffasx64.obj and gvmat64.obj as object to link.
Build instructions
------------------
run bld_64.bat with Microsoft Macro Assembler (x64) for AMD64 (ml64.exe)
ml64.exe is given with Visual Studio 2005, Windows 2003 server DDK
You can get Windows 2003 server DDK with ml64 and cl for AMD64 from
http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price)

View File

@ -1,2 +0,0 @@
ml /coff /Zi /c /Flmatch686.lst match686.asm
ml /coff /Zi /c /Flinffas32.lst inffas32.asm

File diff suppressed because it is too large Load Diff

View File

@ -1,479 +0,0 @@
; match686.asm -- Asm portion of the optimized longest_match for 32 bits x86
; Copyright (C) 1995-1996 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
; File written by Gilles Vollant, by converting match686.S from Brian Raiter
; for MASM. This is as assembly version of longest_match
; from Jean-loup Gailly in deflate.c
;
; http://www.zlib.net
; http://www.winimage.com/zLibDll
; http://www.muppetlabs.com/~breadbox/software/assembly.html
;
; For Visual C++ 4.x and higher and ML 6.x and higher
; ml.exe is distributed in
; http://www.microsoft.com/downloads/details.aspx?FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64
;
; this file contain two implementation of longest_match
;
; this longest_match was written by Brian raiter (1998), optimized for Pentium Pro
; (and the faster known version of match_init on modern Core 2 Duo and AMD Phenom)
;
; for using an assembly version of longest_match, you need define ASMV in project
;
; compile the asm file running
; ml /coff /Zi /c /Flmatch686.lst match686.asm
; and do not include match686.obj in your project
;
; note: contrib of zLib 1.2.3 and earlier contained both a deprecated version for
; Pentium (prior Pentium Pro) and this version for Pentium Pro and modern processor
; with autoselect (with cpu detection code)
; if you want support the old pentium optimization, you can still use these version
;
; this file is not optimized for old pentium, but it compatible with all x86 32 bits
; processor (starting 80386)
;
;
; see below : zlib1222add must be adjuster if you use a zlib version < 1.2.2.2
;uInt longest_match(s, cur_match)
; deflate_state *s;
; IPos cur_match; /* current match */
NbStack equ 76
cur_match equ dword ptr[esp+NbStack-0]
str_s equ dword ptr[esp+NbStack-4]
; 5 dword on top (ret,ebp,esi,edi,ebx)
adrret equ dword ptr[esp+NbStack-8]
pushebp equ dword ptr[esp+NbStack-12]
pushedi equ dword ptr[esp+NbStack-16]
pushesi equ dword ptr[esp+NbStack-20]
pushebx equ dword ptr[esp+NbStack-24]
chain_length equ dword ptr [esp+NbStack-28]
limit equ dword ptr [esp+NbStack-32]
best_len equ dword ptr [esp+NbStack-36]
window equ dword ptr [esp+NbStack-40]
prev equ dword ptr [esp+NbStack-44]
scan_start equ word ptr [esp+NbStack-48]
wmask equ dword ptr [esp+NbStack-52]
match_start_ptr equ dword ptr [esp+NbStack-56]
nice_match equ dword ptr [esp+NbStack-60]
scan equ dword ptr [esp+NbStack-64]
windowlen equ dword ptr [esp+NbStack-68]
match_start equ dword ptr [esp+NbStack-72]
strend equ dword ptr [esp+NbStack-76]
NbStackAdd equ (NbStack-24)
.386p
name gvmatch
.MODEL FLAT
; all the +zlib1222add offsets are due to the addition of fields
; in zlib in the deflate_state structure since the asm code was first written
; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
zlib1222add equ 8
; Note : these value are good with a 8 bytes boundary pack structure
dep_chain_length equ 74h+zlib1222add
dep_window equ 30h+zlib1222add
dep_strstart equ 64h+zlib1222add
dep_prev_length equ 70h+zlib1222add
dep_nice_match equ 88h+zlib1222add
dep_w_size equ 24h+zlib1222add
dep_prev equ 38h+zlib1222add
dep_w_mask equ 2ch+zlib1222add
dep_good_match equ 84h+zlib1222add
dep_match_start equ 68h+zlib1222add
dep_lookahead equ 6ch+zlib1222add
_TEXT segment
IFDEF NOUNDERLINE
public longest_match
public match_init
ELSE
public _longest_match
public _match_init
ENDIF
MAX_MATCH equ 258
MIN_MATCH equ 3
MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
MAX_MATCH equ 258
MIN_MATCH equ 3
MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1)
MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h)
;;; stack frame offsets
chainlenwmask equ esp + 0 ; high word: current chain len
; low word: s->wmask
window equ esp + 4 ; local copy of s->window
windowbestlen equ esp + 8 ; s->window + bestlen
scanstart equ esp + 16 ; first two bytes of string
scanend equ esp + 12 ; last two bytes of string
scanalign equ esp + 20 ; dword-misalignment of string
nicematch equ esp + 24 ; a good enough match size
bestlen equ esp + 28 ; size of best match so far
scan equ esp + 32 ; ptr to string wanting match
LocalVarsSize equ 36
; saved ebx byte esp + 36
; saved edi byte esp + 40
; saved esi byte esp + 44
; saved ebp byte esp + 48
; return address byte esp + 52
deflatestate equ esp + 56 ; the function arguments
curmatch equ esp + 60
;;; Offsets for fields in the deflate_state structure. These numbers
;;; are calculated from the definition of deflate_state, with the
;;; assumption that the compiler will dword-align the fields. (Thus,
;;; changing the definition of deflate_state could easily cause this
;;; program to crash horribly, without so much as a warning at
;;; compile time. Sigh.)
dsWSize equ 36+zlib1222add
dsWMask equ 44+zlib1222add
dsWindow equ 48+zlib1222add
dsPrev equ 56+zlib1222add
dsMatchLen equ 88+zlib1222add
dsPrevMatch equ 92+zlib1222add
dsStrStart equ 100+zlib1222add
dsMatchStart equ 104+zlib1222add
dsLookahead equ 108+zlib1222add
dsPrevLen equ 112+zlib1222add
dsMaxChainLen equ 116+zlib1222add
dsGoodMatch equ 132+zlib1222add
dsNiceMatch equ 136+zlib1222add
;;; match686.asm -- Pentium-Pro-optimized version of longest_match()
;;; Written for zlib 1.1.2
;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html
;;;
;;
;; This software is provided 'as-is', without any express or implied
;; warranty. In no event will the authors be held liable for any damages
;; arising from the use of this software.
;;
;; Permission is granted to anyone to use this software for any purpose,
;; including commercial applications, and to alter it and redistribute it
;; freely, subject to the following restrictions:
;;
;; 1. The origin of this software must not be misrepresented; you must not
;; claim that you wrote the original software. If you use this software
;; in a product, an acknowledgment in the product documentation would be
;; appreciated but is not required.
;; 2. Altered source versions must be plainly marked as such, and must not be
;; misrepresented as being the original software
;; 3. This notice may not be removed or altered from any source distribution.
;;
;GLOBAL _longest_match, _match_init
;SECTION .text
;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
;_longest_match:
IFDEF NOUNDERLINE
longest_match proc near
ELSE
_longest_match proc near
ENDIF
.FPO (9, 4, 0, 0, 1, 0)
;;; Save registers that the compiler may be using, and adjust esp to
;;; make room for our stack frame.
push ebp
push edi
push esi
push ebx
sub esp, LocalVarsSize
;;; Retrieve the function arguments. ecx will hold cur_match
;;; throughout the entire function. edx will hold the pointer to the
;;; deflate_state structure during the function's setup (before
;;; entering the main loop.
mov edx, [deflatestate]
mov ecx, [curmatch]
;;; uInt wmask = s->w_mask;
;;; unsigned chain_length = s->max_chain_length;
;;; if (s->prev_length >= s->good_match) {
;;; chain_length >>= 2;
;;; }
mov eax, [edx + dsPrevLen]
mov ebx, [edx + dsGoodMatch]
cmp eax, ebx
mov eax, [edx + dsWMask]
mov ebx, [edx + dsMaxChainLen]
jl LastMatchGood
shr ebx, 2
LastMatchGood:
;;; chainlen is decremented once beforehand so that the function can
;;; use the sign flag instead of the zero flag for the exit test.
;;; It is then shifted into the high word, to make room for the wmask
;;; value, which it will always accompany.
dec ebx
shl ebx, 16
or ebx, eax
mov [chainlenwmask], ebx
;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
mov eax, [edx + dsNiceMatch]
mov ebx, [edx + dsLookahead]
cmp ebx, eax
jl LookaheadLess
mov ebx, eax
LookaheadLess: mov [nicematch], ebx
;;; register Bytef *scan = s->window + s->strstart;
mov esi, [edx + dsWindow]
mov [window], esi
mov ebp, [edx + dsStrStart]
lea edi, [esi + ebp]
mov [scan], edi
;;; Determine how many bytes the scan ptr is off from being
;;; dword-aligned.
mov eax, edi
neg eax
and eax, 3
mov [scanalign], eax
;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
mov eax, [edx + dsWSize]
sub eax, MIN_LOOKAHEAD
sub ebp, eax
jg LimitPositive
xor ebp, ebp
LimitPositive:
;;; int best_len = s->prev_length;
mov eax, [edx + dsPrevLen]
mov [bestlen], eax
;;; Store the sum of s->window + best_len in esi locally, and in esi.
add esi, eax
mov [windowbestlen], esi
;;; register ush scan_start = *(ushf*)scan;
;;; register ush scan_end = *(ushf*)(scan+best_len-1);
;;; Posf *prev = s->prev;
movzx ebx, word ptr [edi]
mov [scanstart], ebx
movzx ebx, word ptr [edi + eax - 1]
mov [scanend], ebx
mov edi, [edx + dsPrev]
;;; Jump into the main loop.
mov edx, [chainlenwmask]
jmp short LoopEntry
align 4
;;; do {
;;; match = s->window + cur_match;
;;; if (*(ushf*)(match+best_len-1) != scan_end ||
;;; *(ushf*)match != scan_start) continue;
;;; [...]
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
;;; && --chain_length != 0);
;;;
;;; Here is the inner loop of the function. The function will spend the
;;; majority of its time in this loop, and majority of that time will
;;; be spent in the first ten instructions.
;;;
;;; Within this loop:
;;; ebx = scanend
;;; ecx = curmatch
;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
;;; esi = windowbestlen - i.e., (window + bestlen)
;;; edi = prev
;;; ebp = limit
LookupLoop:
and ecx, edx
movzx ecx, word ptr [edi + ecx*2]
cmp ecx, ebp
jbe LeaveNow
sub edx, 00010000h
js LeaveNow
LoopEntry: movzx eax, word ptr [esi + ecx - 1]
cmp eax, ebx
jnz LookupLoop
mov eax, [window]
movzx eax, word ptr [eax + ecx]
cmp eax, [scanstart]
jnz LookupLoop
;;; Store the current value of chainlen.
mov [chainlenwmask], edx
;;; Point edi to the string under scrutiny, and esi to the string we
;;; are hoping to match it up with. In actuality, esi and edi are
;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
;;; initialized to -(MAX_MATCH_8 - scanalign).
mov esi, [window]
mov edi, [scan]
add esi, ecx
mov eax, [scanalign]
mov edx, 0fffffef8h; -(MAX_MATCH_8)
lea edi, [edi + eax + 0108h] ;MAX_MATCH_8]
lea esi, [esi + eax + 0108h] ;MAX_MATCH_8]
;;; Test the strings for equality, 8 bytes at a time. At the end,
;;; adjust edx so that it is offset to the exact byte that mismatched.
;;;
;;; We already know at this point that the first three bytes of the
;;; strings match each other, and they can be safely passed over before
;;; starting the compare loop. So what this code does is skip over 0-3
;;; bytes, as much as necessary in order to dword-align the edi
;;; pointer. (esi will still be misaligned three times out of four.)
;;;
;;; It should be confessed that this loop usually does not represent
;;; much of the total running time. Replacing it with a more
;;; straightforward "rep cmpsb" would not drastically degrade
;;; performance.
LoopCmps:
mov eax, [esi + edx]
xor eax, [edi + edx]
jnz LeaveLoopCmps
mov eax, [esi + edx + 4]
xor eax, [edi + edx + 4]
jnz LeaveLoopCmps4
add edx, 8
jnz LoopCmps
jmp short LenMaximum
LeaveLoopCmps4: add edx, 4
LeaveLoopCmps: test eax, 0000FFFFh
jnz LenLower
add edx, 2
shr eax, 16
LenLower: sub al, 1
adc edx, 0
;;; Calculate the length of the match. If it is longer than MAX_MATCH,
;;; then automatically accept it as the best possible match and leave.
lea eax, [edi + edx]
mov edi, [scan]
sub eax, edi
cmp eax, MAX_MATCH
jge LenMaximum
;;; If the length of the match is not longer than the best match we
;;; have so far, then forget it and return to the lookup loop.
mov edx, [deflatestate]
mov ebx, [bestlen]
cmp eax, ebx
jg LongerMatch
mov esi, [windowbestlen]
mov edi, [edx + dsPrev]
mov ebx, [scanend]
mov edx, [chainlenwmask]
jmp LookupLoop
;;; s->match_start = cur_match;
;;; best_len = len;
;;; if (len >= nice_match) break;
;;; scan_end = *(ushf*)(scan+best_len-1);
LongerMatch: mov ebx, [nicematch]
mov [bestlen], eax
mov [edx + dsMatchStart], ecx
cmp eax, ebx
jge LeaveNow
mov esi, [window]
add esi, eax
mov [windowbestlen], esi
movzx ebx, word ptr [edi + eax - 1]
mov edi, [edx + dsPrev]
mov [scanend], ebx
mov edx, [chainlenwmask]
jmp LookupLoop
;;; Accept the current string, with the maximum possible length.
LenMaximum: mov edx, [deflatestate]
mov dword ptr [bestlen], MAX_MATCH
mov [edx + dsMatchStart], ecx
;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
;;; return s->lookahead;
LeaveNow:
mov edx, [deflatestate]
mov ebx, [bestlen]
mov eax, [edx + dsLookahead]
cmp ebx, eax
jg LookaheadRet
mov eax, ebx
LookaheadRet:
;;; Restore the stack and return from whence we came.
add esp, LocalVarsSize
pop ebx
pop esi
pop edi
pop ebp
ret
; please don't remove this string !
; Your can freely use match686 in any free or commercial app if you don't remove the string in the binary!
db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah
IFDEF NOUNDERLINE
longest_match endp
ELSE
_longest_match endp
ENDIF
IFDEF NOUNDERLINE
match_init proc near
ret
match_init endp
ELSE
_match_init proc near
ret
_match_init endp
ENDIF
_TEXT ends
end

View File

@ -1,27 +0,0 @@
Summary
-------
This directory contains ASM implementations of the functions
longest_match() and inflate_fast().
Use instructions
----------------
Assemble using MASM, and copy the object files into the zlib source
directory, then run the appropriate makefile, as suggested below. You can
donwload MASM from here:
http://www.microsoft.com/downloads/details.aspx?displaylang=en&FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64
You can also get objects files here:
http://www.winimage.com/zLibDll/zlib124_masm_obj.zip
Build instructions
------------------
* With Microsoft C and MASM:
nmake -f win32/Makefile.msc LOC="-DASMV -DASMINF" OBJA="match686.obj inffas32.obj"
* With Borland C and TASM:
make -f win32/Makefile.bor LOCAL_ZLIB="-DASMV -DASMINF" OBJA="match686.obj inffas32.obj" OBJPA="+match686c.obj+match686.obj+inffas32.obj"

View File

@ -3,7 +3,6 @@
#
# Usage:
# make -f win32/Makefile.bor
# make -f win32/Makefile.bor LOCAL_ZLIB=-DASMV OBJA=match.obj OBJPA=+match.obj
# ------------ Borland C++ ------------

View File

@ -11,10 +11,6 @@
#
# make -fwin32/Makefile.gcc; make test testdll -fwin32/Makefile.gcc
#
# To use the asm code, type:
# cp contrib/asm?86/match.S ./match.S
# make LOC=-DASMV OBJA=match.o -fwin32/Makefile.gcc
#
# To install libz.a, zconf.h and zlib.h in the system directories, type:
#
# make install -fwin32/Makefile.gcc
@ -38,7 +34,6 @@ IMPLIB = libz.dll.a
#
SHARED_MODE=0
#LOC = -DASMV
#LOC = -DZLIB_DEBUG -g
PREFIX =

View File

@ -4,10 +4,6 @@
# Usage:
# nmake -f win32/Makefile.msc (standard build)
# nmake -f win32/Makefile.msc LOC=-DFOO (nonstandard build)
# nmake -f win32/Makefile.msc LOC="-DASMV -DASMINF" \
# OBJA="inffas32.obj match686.obj" (use ASM code, x86)
# nmake -f win32/Makefile.msc AS=ml64 LOC="-DASMV -DASMINF -I." \
# OBJA="inffasx64.obj gvmat64.obj inffas8664.obj" (use ASM code, x64)
# The toplevel directory of the source tree.
#