mirror of https://github.com/madler/zlib
Remove old assembler code in which bugs have manifested.
In addition, there is not sufficient gain from the inflate assembler code to warrant its inclusion.
This commit is contained in:
parent
a577351394
commit
288f108031
|
@ -8,14 +8,6 @@ ada/ by Dmitriy Anisimkov <anisimkov@yahoo.com>
|
|||
Support for Ada
|
||||
See http://zlib-ada.sourceforge.net/
|
||||
|
||||
amd64/ by Mikhail Teterin <mi@ALDAN.algebra.com>
|
||||
asm code for AMD64
|
||||
See patch at http://www.freebsd.org/cgi/query-pr.cgi?pr=bin/96393
|
||||
|
||||
asm686/ by Brian Raiter <breadbox@muppetlabs.com>
|
||||
asm code for Pentium and PPro/PII, using the AT&T (GNU as) syntax
|
||||
See http://www.muppetlabs.com/~breadbox/software/assembly.html
|
||||
|
||||
blast/ by Mark Adler <madler@alumni.caltech.edu>
|
||||
Decompressor for output of PKWare Data Compression Library (DCL)
|
||||
|
||||
|
@ -32,9 +24,6 @@ gcc_gvmat64/by Gilles Vollant <info@winimage.com>
|
|||
infback9/ by Mark Adler <madler@alumni.caltech.edu>
|
||||
Unsupported diffs to infback to decode the deflate64 format
|
||||
|
||||
inflate86/ by Chris Anderson <christop@charm.net>
|
||||
Tuned x86 gcc asm code to replace inflate_fast()
|
||||
|
||||
iostream/ by Kevin Ruland <kevin@rodin.wustl.edu>
|
||||
A C++ I/O streams interface to the zlib gz* functions
|
||||
|
||||
|
@ -45,16 +34,6 @@ iostream3/ by Ludwig Schwardt <schwardt@sun.ac.za>
|
|||
and Kevin Ruland <kevin@rodin.wustl.edu>
|
||||
Yet another C++ I/O streams interface
|
||||
|
||||
masmx64/ by Gilles Vollant <info@winimage.com>
|
||||
x86 64-bit (AMD64 and Intel EM64t) code for x64 assembler to
|
||||
replace longest_match() and inflate_fast(), also masm x86
|
||||
64-bits translation of Chris Anderson inflate_fast()
|
||||
|
||||
masmx86/ by Gilles Vollant <info@winimage.com>
|
||||
x86 asm code to replace longest_match() and inflate_fast(),
|
||||
for Visual C++ and MASM (32 bits).
|
||||
Based on Brian Raiter (asm686) and Chris Anderson (inflate86)
|
||||
|
||||
minizip/ by Gilles Vollant <info@winimage.com>
|
||||
Mini zip and unzip based on zlib
|
||||
Includes Zip64 support by Mathias Svensson <mathias@result42.com>
|
||||
|
|
|
@ -1,452 +0,0 @@
|
|||
/*
|
||||
* match.S -- optimized version of longest_match()
|
||||
* based on the similar work by Gilles Vollant, and Brian Raiter, written 1998
|
||||
*
|
||||
* This is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the BSD License. Use by owners of Che Guevarra
|
||||
* parafernalia is prohibited, where possible, and highly discouraged
|
||||
* elsewhere.
|
||||
*/
|
||||
|
||||
#ifndef NO_UNDERLINE
|
||||
# define match_init _match_init
|
||||
# define longest_match _longest_match
|
||||
#endif
|
||||
|
||||
#define scanend ebx
|
||||
#define scanendw bx
|
||||
#define chainlenwmask edx /* high word: current chain len low word: s->wmask */
|
||||
#define curmatch rsi
|
||||
#define curmatchd esi
|
||||
#define windowbestlen r8
|
||||
#define scanalign r9
|
||||
#define scanalignd r9d
|
||||
#define window r10
|
||||
#define bestlen r11
|
||||
#define bestlend r11d
|
||||
#define scanstart r12d
|
||||
#define scanstartw r12w
|
||||
#define scan r13
|
||||
#define nicematch r14d
|
||||
#define limit r15
|
||||
#define limitd r15d
|
||||
#define prev rcx
|
||||
|
||||
/*
|
||||
* The 258 is a "magic number", not a parameter -- changing it
|
||||
* makes all hell break loose
|
||||
*/
|
||||
#define MAX_MATCH (258)
|
||||
#define MIN_MATCH (3)
|
||||
#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
|
||||
#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
|
||||
|
||||
/* stack frame offsets */
|
||||
#define LocalVarsSize (112)
|
||||
#define _chainlenwmask ( 8-LocalVarsSize)(%rsp)
|
||||
#define _windowbestlen (16-LocalVarsSize)(%rsp)
|
||||
#define save_r14 (24-LocalVarsSize)(%rsp)
|
||||
#define save_rsi (32-LocalVarsSize)(%rsp)
|
||||
#define save_rbx (40-LocalVarsSize)(%rsp)
|
||||
#define save_r12 (56-LocalVarsSize)(%rsp)
|
||||
#define save_r13 (64-LocalVarsSize)(%rsp)
|
||||
#define save_r15 (80-LocalVarsSize)(%rsp)
|
||||
|
||||
|
||||
.globl match_init, longest_match
|
||||
|
||||
/*
|
||||
* On AMD64 the first argument of a function (in our case -- the pointer to
|
||||
* deflate_state structure) is passed in %rdi, hence our offsets below are
|
||||
* all off of that.
|
||||
*/
|
||||
|
||||
/* you can check the structure offset by running
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "deflate.h"
|
||||
|
||||
void print_depl()
|
||||
{
|
||||
deflate_state ds;
|
||||
deflate_state *s=&ds;
|
||||
printf("size pointer=%u\n",(int)sizeof(void*));
|
||||
|
||||
printf("#define dsWSize (%3u)(%%rdi)\n",(int)(((char*)&(s->w_size))-((char*)s)));
|
||||
printf("#define dsWMask (%3u)(%%rdi)\n",(int)(((char*)&(s->w_mask))-((char*)s)));
|
||||
printf("#define dsWindow (%3u)(%%rdi)\n",(int)(((char*)&(s->window))-((char*)s)));
|
||||
printf("#define dsPrev (%3u)(%%rdi)\n",(int)(((char*)&(s->prev))-((char*)s)));
|
||||
printf("#define dsMatchLen (%3u)(%%rdi)\n",(int)(((char*)&(s->match_length))-((char*)s)));
|
||||
printf("#define dsPrevMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_match))-((char*)s)));
|
||||
printf("#define dsStrStart (%3u)(%%rdi)\n",(int)(((char*)&(s->strstart))-((char*)s)));
|
||||
printf("#define dsMatchStart (%3u)(%%rdi)\n",(int)(((char*)&(s->match_start))-((char*)s)));
|
||||
printf("#define dsLookahead (%3u)(%%rdi)\n",(int)(((char*)&(s->lookahead))-((char*)s)));
|
||||
printf("#define dsPrevLen (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_length))-((char*)s)));
|
||||
printf("#define dsMaxChainLen (%3u)(%%rdi)\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
|
||||
printf("#define dsGoodMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->good_match))-((char*)s)));
|
||||
printf("#define dsNiceMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->nice_match))-((char*)s)));
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
to compile for XCode 3.2 on MacOSX x86_64
|
||||
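- run "gcc -g -c -DXCODE_MAC_X64_STRUCTURE amd64-match.S"
(NOTE: the structure-offset tables below are selected by testing
CURRENT_LINX_XCODE_MAC_X64_STRUCTURE, not XCODE_MAC_X64_STRUCTURE, so the
flag shown above does not change which table is used -- confirm which
macro name is intended.)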
|
||||
*/
|
||||
|
||||
|
||||
#ifndef CURRENT_LINX_XCODE_MAC_X64_STRUCTURE
|
||||
#define dsWSize ( 68)(%rdi)
|
||||
#define dsWMask ( 76)(%rdi)
|
||||
#define dsWindow ( 80)(%rdi)
|
||||
#define dsPrev ( 96)(%rdi)
|
||||
#define dsMatchLen (144)(%rdi)
|
||||
#define dsPrevMatch (148)(%rdi)
|
||||
#define dsStrStart (156)(%rdi)
|
||||
#define dsMatchStart (160)(%rdi)
|
||||
#define dsLookahead (164)(%rdi)
|
||||
#define dsPrevLen (168)(%rdi)
|
||||
#define dsMaxChainLen (172)(%rdi)
|
||||
#define dsGoodMatch (188)(%rdi)
|
||||
#define dsNiceMatch (192)(%rdi)
|
||||
|
||||
#else
|
||||
|
||||
#ifndef STRUCT_OFFSET
|
||||
# define STRUCT_OFFSET (0)
|
||||
#endif
|
||||
|
||||
|
||||
#define dsWSize ( 56 + STRUCT_OFFSET)(%rdi)
|
||||
#define dsWMask ( 64 + STRUCT_OFFSET)(%rdi)
|
||||
#define dsWindow ( 72 + STRUCT_OFFSET)(%rdi)
|
||||
#define dsPrev ( 88 + STRUCT_OFFSET)(%rdi)
|
||||
#define dsMatchLen (136 + STRUCT_OFFSET)(%rdi)
|
||||
#define dsPrevMatch (140 + STRUCT_OFFSET)(%rdi)
|
||||
#define dsStrStart (148 + STRUCT_OFFSET)(%rdi)
|
||||
#define dsMatchStart (152 + STRUCT_OFFSET)(%rdi)
|
||||
#define dsLookahead (156 + STRUCT_OFFSET)(%rdi)
|
||||
#define dsPrevLen (160 + STRUCT_OFFSET)(%rdi)
|
||||
#define dsMaxChainLen (164 + STRUCT_OFFSET)(%rdi)
|
||||
#define dsGoodMatch (180 + STRUCT_OFFSET)(%rdi)
|
||||
#define dsNiceMatch (184 + STRUCT_OFFSET)(%rdi)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
.text
|
||||
|
||||
/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
|
||||
|
||||
longest_match:
/* Walks the s->prev chain starting at cur_match, comparing each candidate */
/* in s->window against the string at s->window + s->strstart. Writes the */
/* position of the best candidate to s->match_start and returns, in %eax, */
/* the smaller of the best length found and s->lookahead. */
|
||||
/*
|
||||
* Retrieve the function arguments. %curmatch will hold cur_match
|
||||
* throughout the entire function (passed via rsi on amd64).
|
||||
* rdi will hold the pointer to the deflate_state (first arg on amd64)
|
||||
*/
|
||||
/* Nothing is pushed and %rsp is never adjusted: the save_* and _* slots */
/* are at negative offsets from %rsp (see LocalVarsSize). */
/* NOTE(review): this presumably relies on the System V AMD64 red zone -- */
/* confirm for the target ABI. */
mov %rsi, save_rsi
|
||||
mov %rbx, save_rbx
|
||||
mov %r12, save_r12
|
||||
mov %r13, save_r13
|
||||
mov %r14, save_r14
|
||||
mov %r15, save_r15
|
||||
|
||||
/* uInt wmask = s->w_mask; */
|
||||
/* unsigned chain_length = s->max_chain_length; */
|
||||
/* if (s->prev_length >= s->good_match) { */
|
||||
/* chain_length >>= 2; */
|
||||
/* } */
|
||||
|
||||
movl dsPrevLen, %eax
|
||||
movl dsGoodMatch, %ebx
|
||||
cmpl %ebx, %eax
|
||||
/* The two loads below do not modify the flags, so the jl still tests the */
/* cmpl above: it is taken (skipping the shift) when prev_length is less */
/* than good_match, using a signed comparison. */
movl dsWMask, %eax
|
||||
movl dsMaxChainLen, %chainlenwmask
|
||||
jl LastMatchGood
|
||||
shrl $2, %chainlenwmask
|
||||
LastMatchGood:
|
||||
|
||||
/* chainlen is decremented once beforehand so that the function can */
|
||||
/* use the sign flag instead of the zero flag for the exit test. */
|
||||
/* It is then shifted into the high word, to make room for the wmask */
|
||||
/* value, which it will always accompany. */
|
||||
|
||||
decl %chainlenwmask
|
||||
shll $16, %chainlenwmask
|
||||
orl %eax, %chainlenwmask
|
||||
|
||||
/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
|
||||
|
||||
/* %eax = nice_match, %ebx = lookahead; %nicematch receives the smaller of */
/* the two (the comparison is signed). */
movl dsNiceMatch, %eax
|
||||
movl dsLookahead, %ebx
|
||||
cmpl %eax, %ebx
|
||||
jl LookaheadLess
|
||||
movl %eax, %ebx
|
||||
LookaheadLess: movl %ebx, %nicematch
|
||||
|
||||
/* register Bytef *scan = s->window + s->strstart; */
|
||||
|
||||
/* strstart is loaded into %limitd first; %limit is turned into the real */
/* limit value a few instructions further down. */
mov dsWindow, %window
|
||||
movl dsStrStart, %limitd
|
||||
lea (%limit, %window), %scan
|
||||
|
||||
/* Determine how many bytes the scan ptr is off from being */
|
||||
/* dword-aligned. */
|
||||
|
||||
/* scanalign = (-scan) & 3 */
mov %scan, %scanalign
|
||||
negl %scanalignd
|
||||
andl $3, %scanalignd
|
||||
|
||||
/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
|
||||
/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
|
||||
|
||||
/* %eax = w_size - MIN_LOOKAHEAD; limit = strstart - %eax, replaced by the */
/* zero in %ecx when the subtraction result is not greater than zero. */
/* %ecx/%rcx is free here: it only becomes %prev once dsPrev is loaded below. */
movl dsWSize, %eax
|
||||
subl $MIN_LOOKAHEAD, %eax
|
||||
xorl %ecx, %ecx
|
||||
subl %eax, %limitd
|
||||
cmovng %ecx, %limitd
|
||||
|
||||
/* int best_len = s->prev_length; */
|
||||
|
||||
movl dsPrevLen, %bestlend
|
||||
|
||||
/* Store the sum of s->window + best_len in %windowbestlen locally, and in memory. */
|
||||
|
||||
lea (%window, %bestlen), %windowbestlen
|
||||
mov %windowbestlen, _windowbestlen
|
||||
|
||||
/* register ush scan_start = *(ushf*)scan; */
|
||||
/* register ush scan_end = *(ushf*)(scan+best_len-1); */
|
||||
/* Posf *prev = s->prev; */
|
||||
|
||||
movzwl (%scan), %scanstart
|
||||
movzwl -1(%scan, %bestlen), %scanend
|
||||
mov dsPrev, %prev
|
||||
|
||||
/* Jump into the main loop. */
|
||||
|
||||
movl %chainlenwmask, _chainlenwmask
|
||||
jmp LoopEntry
|
||||
|
||||
.balign 16
|
||||
|
||||
/* do {
|
||||
* match = s->window + cur_match;
|
||||
* if (*(ushf*)(match+best_len-1) != scan_end ||
|
||||
* *(ushf*)match != scan_start) continue;
|
||||
* [...]
|
||||
* } while ((cur_match = prev[cur_match & wmask]) > limit
|
||||
* && --chain_length != 0);
|
||||
*
|
||||
* Here is the inner loop of the function. The function will spend the
|
||||
* majority of its time in this loop, and majority of that time will
|
||||
* be spent in the first ten instructions.
|
||||
*/
|
||||
LookupLoop:
/* NOTE(review): %chainlenwmask also carries the chain count in its high */
/* word, so this is a pure "& wmask" only if cur_match has no bits set */
/* above bit 15 -- confirm for the initial argument (later values are */
/* zero-extended 16-bit loads from prev[]). */
|
||||
andl %chainlenwmask, %curmatchd
|
||||
movzwl (%prev, %curmatch, 2), %curmatchd
|
||||
cmpl %limitd, %curmatchd
|
||||
jbe LeaveNow
|
||||
subl $0x00010000, %chainlenwmask
|
||||
js LeaveNow
|
||||
LoopEntry: cmpw -1(%windowbestlen, %curmatch), %scanendw
|
||||
jne LookupLoop
|
||||
cmpw %scanstartw, (%window, %curmatch)
|
||||
jne LookupLoop
|
||||
|
||||
/* Store the current value of chainlen. */
/* (%rdx, %windowbestlen and %prev are all overwritten by the compare */
/* loop set-up below and reloaded before returning to LookupLoop.) */
|
||||
movl %chainlenwmask, _chainlenwmask
|
||||
|
||||
/* Point %prev at the string under scrutiny (scan) and %windowbestlen at */
|
||||
/* the string we are hoping to match it up with (window + curmatch). In */
|
||||
/* actuality, both are pointed (MAX_MATCH_8 + scanalign) bytes ahead, and */
|
||||
/* %rdx is initialized to -MAX_MATCH_8, so comparing starts scanalign bytes in. */
|
||||
|
||||
mov $(-MAX_MATCH_8), %rdx
|
||||
lea (%curmatch, %window), %windowbestlen
|
||||
lea MAX_MATCH_8(%windowbestlen, %scanalign), %windowbestlen
|
||||
lea MAX_MATCH_8(%scan, %scanalign), %prev
|
||||
|
||||
/* the prefetching below makes very little difference... */
|
||||
prefetcht1 (%windowbestlen, %rdx)
|
||||
prefetcht1 (%prev, %rdx)
|
||||
|
||||
/*
|
||||
* Test the strings for equality, 8 bytes at a time. At the end,
|
||||
* adjust %rdx so that it is offset to the exact byte that mismatched.
|
||||
*
|
||||
* It should be confessed that this loop usually does not represent
|
||||
* much of the total running time. Replacing it with a more
|
||||
* straightforward "rep cmpsb" would not drastically degrade
|
||||
* performance -- unrolling it, for example, makes no difference.
|
||||
*/
|
||||
|
||||
#undef USE_SSE /* works, but is 6-7% slower, than non-SSE... */
|
||||
|
||||
LoopCmps:
|
||||
#ifdef USE_SSE
|
||||
/* Preload the SSE registers */
|
||||
movdqu (%windowbestlen, %rdx), %xmm1
|
||||
movdqu (%prev, %rdx), %xmm2
|
||||
pcmpeqb %xmm2, %xmm1
|
||||
movdqu 16(%windowbestlen, %rdx), %xmm3
|
||||
movdqu 16(%prev, %rdx), %xmm4
|
||||
pcmpeqb %xmm4, %xmm3
|
||||
movdqu 32(%windowbestlen, %rdx), %xmm5
|
||||
movdqu 32(%prev, %rdx), %xmm6
|
||||
pcmpeqb %xmm6, %xmm5
|
||||
movdqu 48(%windowbestlen, %rdx), %xmm7
|
||||
movdqu 48(%prev, %rdx), %xmm8
|
||||
pcmpeqb %xmm8, %xmm7
|
||||
|
||||
/* Check the comparisons' results */
|
||||
pmovmskb %xmm1, %rax
|
||||
notw %ax
|
||||
bsfw %ax, %ax
|
||||
jnz LeaveLoopCmps
|
||||
|
||||
/* this is the only point in the loop where rdx can have been
|
||||
incremented by a total of 0x108 (each loop iteration adds 16*4 = 0x40,
|
||||
and (0x40*4)+8 = 0x108) */
|
||||
add $8, %rdx
|
||||
jz LenMaximum
|
||||
add $8, %rdx
|
||||
|
||||
|
||||
pmovmskb %xmm3, %rax
|
||||
notw %ax
|
||||
bsfw %ax, %ax
|
||||
jnz LeaveLoopCmps
|
||||
|
||||
|
||||
add $16, %rdx
|
||||
|
||||
|
||||
pmovmskb %xmm5, %rax
|
||||
notw %ax
|
||||
bsfw %ax, %ax
|
||||
jnz LeaveLoopCmps
|
||||
|
||||
add $16, %rdx
|
||||
|
||||
|
||||
pmovmskb %xmm7, %rax
|
||||
notw %ax
|
||||
bsfw %ax, %ax
|
||||
jnz LeaveLoopCmps
|
||||
|
||||
add $16, %rdx
|
||||
|
||||
jmp LoopCmps
|
||||
LeaveLoopCmps: add %rax, %rdx
|
||||
#else
/* Each pass XORs three 8-byte words of the two strings; a non-zero */
/* result marks a mismatch somewhere in that word. %rdx advances by 24 */
/* per pass, and reaching zero means no mismatch was found. */
|
||||
mov (%windowbestlen, %rdx), %rax
|
||||
xor (%prev, %rdx), %rax
|
||||
jnz LeaveLoopCmps
|
||||
|
||||
mov 8(%windowbestlen, %rdx), %rax
|
||||
xor 8(%prev, %rdx), %rax
|
||||
jnz LeaveLoopCmps8
|
||||
|
||||
mov 16(%windowbestlen, %rdx), %rax
|
||||
xor 16(%prev, %rdx), %rax
|
||||
jnz LeaveLoopCmps16
|
||||
|
||||
add $24, %rdx
|
||||
jnz LoopCmps
|
||||
jmp LenMaximum
|
||||
# if 0
|
||||
/*
|
||||
* This three-liner is tantalizingly simple, but bsf is a slow instruction,
|
||||
* and the complicated alternative down below is quite a bit faster. Sad...
|
||||
*/
|
||||
|
||||
LeaveLoopCmps: bsf %rax, %rax /* find the first non-zero bit */
|
||||
shrl $3, %eax /* divide by 8 to get the byte */
|
||||
add %rax, %rdx
|
||||
# else
/* Locate the lowest non-zero byte of the XOR result in %rax by halving: */
/* skip 4 bytes if the low dword is zero, then 2 bytes if the low word is */
/* zero, then 1 more byte if the low byte is zero. */
|
||||
LeaveLoopCmps16:
|
||||
add $8, %rdx
|
||||
LeaveLoopCmps8:
|
||||
add $8, %rdx
|
||||
LeaveLoopCmps: testl $0xFFFFFFFF, %eax /* Check the first 4 bytes */
|
||||
jnz Check16
|
||||
add $4, %rdx
|
||||
shr $32, %rax
|
||||
Check16: testw $0xFFFF, %ax
|
||||
jnz LenLower
|
||||
add $2, %rdx
|
||||
shrl $16, %eax
/* subb sets the carry flag only when %al was zero (low byte matched); */
/* the adc then adds that carry, stepping %rdx past the matching byte. */
|
||||
LenLower: subb $1, %al
|
||||
adc $0, %rdx
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* Calculate the length of the match. If it is longer than MAX_MATCH, */
|
||||
/* then automatically accept it as the best possible match and leave. */
|
||||
|
||||
/* len = (%prev + %rdx) - scan */
lea (%prev, %rdx), %rax
|
||||
sub %scan, %rax
|
||||
cmpl $MAX_MATCH, %eax
|
||||
jge LenMaximum
|
||||
|
||||
/* If the length of the match is not longer than the best match we */
|
||||
/* have so far, then forget it and return to the lookup loop. */
|
||||
|
||||
cmpl %bestlend, %eax
|
||||
jg LongerMatch
/* Restore the three registers the compare loop overwrote */
/* (%edx is %chainlenwmask). */
|
||||
mov _windowbestlen, %windowbestlen
|
||||
mov dsPrev, %prev
|
||||
movl _chainlenwmask, %edx
|
||||
jmp LookupLoop
|
||||
|
||||
/* s->match_start = cur_match; */
|
||||
/* best_len = len; */
|
||||
/* if (len >= nice_match) break; */
|
||||
/* scan_end = *(ushf*)(scan+best_len-1); */
|
||||
|
||||
LongerMatch:
|
||||
movl %eax, %bestlend
|
||||
movl %curmatchd, dsMatchStart
|
||||
cmpl %nicematch, %eax
|
||||
jge LeaveNow
|
||||
|
||||
lea (%window, %bestlen), %windowbestlen
|
||||
mov %windowbestlen, _windowbestlen
|
||||
|
||||
movzwl -1(%scan, %rax), %scanend
|
||||
mov dsPrev, %prev
|
||||
movl _chainlenwmask, %chainlenwmask
|
||||
jmp LookupLoop
|
||||
|
||||
/* Accept the current string, with the maximum possible length. */
|
||||
|
||||
LenMaximum:
|
||||
movl $MAX_MATCH, %bestlend
|
||||
movl %curmatchd, dsMatchStart
|
||||
|
||||
/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
|
||||
/* return s->lookahead; */
|
||||
|
||||
LeaveNow:
/* %eax starts as lookahead and is replaced by best_len when best_len is */
/* not greater than lookahead (signed comparison). */
|
||||
movl dsLookahead, %eax
|
||||
cmpl %eax, %bestlend
|
||||
cmovngl %bestlend, %eax
|
||||
LookaheadRet:
|
||||
|
||||
/* Restore the registers and return from whence we came. */
|
||||
|
||||
mov save_rsi, %rsi
|
||||
mov save_rbx, %rbx
|
||||
mov save_r12, %r12
|
||||
mov save_r13, %r13
|
||||
mov save_r14, %r14
|
||||
mov save_r15, %r15
|
||||
|
||||
ret
|
||||
|
||||
match_init: ret /* performs no initialization: returns immediately */
|
|
@ -1,51 +0,0 @@
|
|||
This is a patched version of zlib, modified to use
|
||||
Pentium-Pro-optimized assembly code in the deflation algorithm. The
|
||||
files changed/added by this patch are:
|
||||
|
||||
README.686
|
||||
match.S
|
||||
|
||||
The speedup that this patch provides varies, depending on whether the
|
||||
compiler used to build the original version of zlib falls afoul of the
|
||||
PPro's speed traps. My own tests show a speedup of around 10-20% at
|
||||
the default compression level, and 20-30% using -9, against a version
|
||||
compiled using gcc 2.7.2.3. Your mileage may vary.
|
||||
|
||||
Note that this code has been tailored for the PPro/PII in particular,
|
||||
and will not perform particularly well on a Pentium.
|
||||
|
||||
If you are using an assembler other than GNU as, you will have to
|
||||
translate match.S to use your assembler's syntax. (Have fun.)
|
||||
|
||||
Brian Raiter
|
||||
breadbox@muppetlabs.com
|
||||
April, 1998
|
||||
|
||||
|
||||
Added for zlib 1.1.3:
|
||||
|
||||
The patches come from
|
||||
http://www.muppetlabs.com/~breadbox/software/assembly.html
|
||||
|
||||
To compile zlib with this asm file, copy match.S to the zlib directory
|
||||
then do:
|
||||
|
||||
CFLAGS="-O3 -DASMV" ./configure
|
||||
make OBJA=match.o
|
||||
|
||||
|
||||
Update:
|
||||
|
||||
I've been ignoring these assembly routines for years, believing that
|
||||
gcc's generated code had caught up with it sometime around gcc 2.95
|
||||
and the major rearchitecting of the Pentium 4. However, I recently
|
||||
learned that, despite what I believed, this code still has some life
|
||||
in it. On the Pentium 4 and AMD64 chips, it continues to run about 8%
|
||||
faster than the code produced by gcc 4.1.
|
||||
|
||||
In acknowledgement of its continuing usefulness, I've altered the
|
||||
license to match that of the rest of zlib. Share and Enjoy!
|
||||
|
||||
Brian Raiter
|
||||
breadbox@muppetlabs.com
|
||||
April, 2007
|
|
@ -1,357 +0,0 @@
|
|||
/* match.S -- x86 assembly version of the zlib longest_match() function.
|
||||
* Optimized for the Intel 686 chips (PPro and later).
|
||||
*
|
||||
* Copyright (C) 1998, 2007 Brian Raiter <breadbox@muppetlabs.com>
|
||||
*
|
||||
* This software is provided 'as-is', without any express or implied
|
||||
* warranty. In no event will the author be held liable for any damages
|
||||
* arising from the use of this software.
|
||||
*
|
||||
* Permission is granted to anyone to use this software for any purpose,
|
||||
* including commercial applications, and to alter it and redistribute it
|
||||
* freely, subject to the following restrictions:
|
||||
*
|
||||
* 1. The origin of this software must not be misrepresented; you must not
|
||||
* claim that you wrote the original software. If you use this software
|
||||
* in a product, an acknowledgment in the product documentation would be
|
||||
* appreciated but is not required.
|
||||
* 2. Altered source versions must be plainly marked as such, and must not be
|
||||
* misrepresented as being the original software.
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#ifndef NO_UNDERLINE
|
||||
#define match_init _match_init
|
||||
#define longest_match _longest_match
|
||||
#endif
|
||||
|
||||
#define MAX_MATCH (258)
|
||||
#define MIN_MATCH (3)
|
||||
#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
|
||||
#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
|
||||
|
||||
/* stack frame offsets */
|
||||
|
||||
#define chainlenwmask 0 /* high word: current chain len */
|
||||
/* low word: s->wmask */
|
||||
#define window 4 /* local copy of s->window */
|
||||
#define windowbestlen 8 /* s->window + bestlen */
|
||||
#define scanstart 16 /* first two bytes of string */
|
||||
#define scanend 12 /* last two bytes of string */
|
||||
#define scanalign 20 /* dword-misalignment of string */
|
||||
#define nicematch 24 /* a good enough match size */
|
||||
#define bestlen 28 /* size of best match so far */
|
||||
#define scan 32 /* ptr to string wanting match */
|
||||
|
||||
#define LocalVarsSize (36)
|
||||
/* saved ebx 36 */
|
||||
/* saved edi 40 */
|
||||
/* saved esi 44 */
|
||||
/* saved ebp 48 */
|
||||
/* return address 52 */
|
||||
#define deflatestate 56 /* the function arguments */
|
||||
#define curmatch 60
|
||||
|
||||
/* All the +zlib1222add offsets are due to the addition of fields
|
||||
* in zlib in the deflate_state structure since the asm code was first written
|
||||
* (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
|
||||
* (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
|
||||
* if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
|
||||
*/
|
||||
|
||||
#define zlib1222add (8)
|
||||
|
||||
#define dsWSize (36+zlib1222add)
|
||||
#define dsWMask (44+zlib1222add)
|
||||
#define dsWindow (48+zlib1222add)
|
||||
#define dsPrev (56+zlib1222add)
|
||||
#define dsMatchLen (88+zlib1222add)
|
||||
#define dsPrevMatch (92+zlib1222add)
|
||||
#define dsStrStart (100+zlib1222add)
|
||||
#define dsMatchStart (104+zlib1222add)
|
||||
#define dsLookahead (108+zlib1222add)
|
||||
#define dsPrevLen (112+zlib1222add)
|
||||
#define dsMaxChainLen (116+zlib1222add)
|
||||
#define dsGoodMatch (132+zlib1222add)
|
||||
#define dsNiceMatch (136+zlib1222add)
|
||||
|
||||
|
||||
.file "match.S"
|
||||
|
||||
.globl match_init, longest_match
|
||||
|
||||
.text
|
||||
|
||||
/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
|
||||
.cfi_sections .debug_frame
|
||||
|
||||
longest_match:
/* Walks the s->prev chain starting at cur_match, comparing each candidate */
/* in s->window against the string at s->window + s->strstart. Writes the */
/* position of the best candidate to s->match_start and returns, in %eax, */
/* the smaller of the best length found and s->lookahead. */
/* Both arguments are read from the stack (deflatestate and curmatch */
/* offsets, valid once the four pushes and the frame are in place). */
|
||||
|
||||
.cfi_startproc
|
||||
/* Save registers that the compiler may be using, and adjust %esp to */
|
||||
/* make room for our stack frame. */
|
||||
|
||||
pushl %ebp
|
||||
.cfi_def_cfa_offset 8
|
||||
.cfi_offset ebp, -8
|
||||
pushl %edi
|
||||
.cfi_def_cfa_offset 12
|
||||
pushl %esi
|
||||
.cfi_def_cfa_offset 16
|
||||
pushl %ebx
|
||||
.cfi_def_cfa_offset 20
|
||||
subl $LocalVarsSize, %esp
|
||||
.cfi_def_cfa_offset LocalVarsSize+20
|
||||
|
||||
/* Retrieve the function arguments. %ecx will hold cur_match */
|
||||
/* throughout the entire function. %edx will hold the pointer to the */
|
||||
/* deflate_state structure during the function's setup (before */
|
||||
/* entering the main loop). */
|
||||
|
||||
movl deflatestate(%esp), %edx
|
||||
movl curmatch(%esp), %ecx
|
||||
|
||||
/* uInt wmask = s->w_mask; */
|
||||
/* unsigned chain_length = s->max_chain_length; */
|
||||
/* if (s->prev_length >= s->good_match) { */
|
||||
/* chain_length >>= 2; */
|
||||
/* } */
|
||||
|
||||
movl dsPrevLen(%edx), %eax
|
||||
movl dsGoodMatch(%edx), %ebx
|
||||
cmpl %ebx, %eax
|
||||
/* The two loads below do not modify the flags, so the jl still tests the */
/* cmpl above: it is taken (skipping the shift) when prev_length is less */
/* than good_match, using a signed comparison. */
movl dsWMask(%edx), %eax
|
||||
movl dsMaxChainLen(%edx), %ebx
|
||||
jl LastMatchGood
|
||||
shrl $2, %ebx
|
||||
LastMatchGood:
|
||||
|
||||
/* chainlen is decremented once beforehand so that the function can */
|
||||
/* use the sign flag instead of the zero flag for the exit test. */
|
||||
/* It is then shifted into the high word, to make room for the wmask */
|
||||
/* value, which it will always accompany. */
|
||||
|
||||
decl %ebx
|
||||
shll $16, %ebx
|
||||
orl %eax, %ebx
|
||||
movl %ebx, chainlenwmask(%esp)
|
||||
|
||||
/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
|
||||
|
||||
/* %eax = nice_match, %ebx = lookahead; the smaller of the two (signed */
/* comparison) is stored in the nicematch slot. */
movl dsNiceMatch(%edx), %eax
|
||||
movl dsLookahead(%edx), %ebx
|
||||
cmpl %eax, %ebx
|
||||
jl LookaheadLess
|
||||
movl %eax, %ebx
|
||||
LookaheadLess: movl %ebx, nicematch(%esp)
|
||||
|
||||
/* register Bytef *scan = s->window + s->strstart; */
|
||||
|
||||
/* %esi = window, %ebp = strstart (turned into limit below), %edi = scan */
movl dsWindow(%edx), %esi
|
||||
movl %esi, window(%esp)
|
||||
movl dsStrStart(%edx), %ebp
|
||||
lea (%esi,%ebp), %edi
|
||||
movl %edi, scan(%esp)
|
||||
|
||||
/* Determine how many bytes the scan ptr is off from being */
|
||||
/* dword-aligned. */
|
||||
|
||||
/* scanalign = (-scan) & 3 */
movl %edi, %eax
|
||||
negl %eax
|
||||
andl $3, %eax
|
||||
movl %eax, scanalign(%esp)
|
||||
|
||||
/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
|
||||
/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
|
||||
|
||||
/* %eax = w_size - MIN_LOOKAHEAD; %ebp = strstart - %eax, zeroed when the */
/* subtraction result is not greater than zero. */
movl dsWSize(%edx), %eax
|
||||
subl $MIN_LOOKAHEAD, %eax
|
||||
subl %eax, %ebp
|
||||
jg LimitPositive
|
||||
xorl %ebp, %ebp
|
||||
LimitPositive:
|
||||
|
||||
/* int best_len = s->prev_length; */
|
||||
|
||||
movl dsPrevLen(%edx), %eax
|
||||
movl %eax, bestlen(%esp)
|
||||
|
||||
/* Store the sum of s->window + best_len in %esi, and in memory. */
|
||||
|
||||
addl %eax, %esi
|
||||
movl %esi, windowbestlen(%esp)
|
||||
|
||||
/* register ush scan_start = *(ushf*)scan; */
|
||||
/* register ush scan_end = *(ushf*)(scan+best_len-1); */
|
||||
/* Posf *prev = s->prev; */
|
||||
|
||||
movzwl (%edi), %ebx
|
||||
movl %ebx, scanstart(%esp)
|
||||
movzwl -1(%edi,%eax), %ebx
|
||||
movl %ebx, scanend(%esp)
|
||||
movl dsPrev(%edx), %edi
|
||||
|
||||
/* Jump into the main loop. */
|
||||
|
||||
movl chainlenwmask(%esp), %edx
|
||||
jmp LoopEntry
|
||||
|
||||
.balign 16
|
||||
|
||||
/* do {
|
||||
* match = s->window + cur_match;
|
||||
* if (*(ushf*)(match+best_len-1) != scan_end ||
|
||||
* *(ushf*)match != scan_start) continue;
|
||||
* [...]
|
||||
* } while ((cur_match = prev[cur_match & wmask]) > limit
|
||||
* && --chain_length != 0);
|
||||
*
|
||||
* Here is the inner loop of the function. The function will spend the
|
||||
* majority of its time in this loop, and majority of that time will
|
||||
* be spent in the first ten instructions.
|
||||
*
|
||||
* Within this loop:
|
||||
* %ebx = scanend
|
||||
* %ecx = curmatch
|
||||
* %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
|
||||
* %esi = windowbestlen - i.e., (window + bestlen)
|
||||
* %edi = prev
|
||||
* %ebp = limit
|
||||
*/
|
||||
LookupLoop:
/* NOTE(review): %edx also carries the chain count in its high word, so */
/* this is a pure "& wmask" only if cur_match has no bits set above */
/* bit 15 -- confirm for the initial argument (later values are */
/* zero-extended 16-bit loads from prev[]). */
|
||||
andl %edx, %ecx
|
||||
movzwl (%edi,%ecx,2), %ecx
|
||||
cmpl %ebp, %ecx
|
||||
jbe LeaveNow
|
||||
subl $0x00010000, %edx
|
||||
js LeaveNow
|
||||
LoopEntry: movzwl -1(%esi,%ecx), %eax
|
||||
cmpl %ebx, %eax
|
||||
jnz LookupLoop
|
||||
movl window(%esp), %eax
|
||||
movzwl (%eax,%ecx), %eax
|
||||
cmpl scanstart(%esp), %eax
|
||||
jnz LookupLoop
|
||||
|
||||
/* Store the current value of chainlen. */
/* (%edx, %esi and %edi are all overwritten by the compare loop set-up */
/* below and reloaded before returning to LookupLoop.) */
|
||||
|
||||
movl %edx, chainlenwmask(%esp)
|
||||
|
||||
/* Point %edi to the string under scrutiny, and %esi to the string we */
|
||||
/* are hoping to match it up with. In actuality, %esi and %edi are */
|
||||
/* both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and %edx is */
|
||||
/* initialized to -MAX_MATCH_8, so comparing starts scanalign bytes in. */
|
||||
|
||||
movl window(%esp), %esi
|
||||
movl scan(%esp), %edi
|
||||
addl %ecx, %esi
|
||||
movl scanalign(%esp), %eax
|
||||
movl $(-MAX_MATCH_8), %edx
|
||||
lea MAX_MATCH_8(%edi,%eax), %edi
|
||||
lea MAX_MATCH_8(%esi,%eax), %esi
|
||||
|
||||
/* Test the strings for equality, 8 bytes at a time. At the end,
|
||||
* adjust %edx so that it is offset to the exact byte that mismatched.
|
||||
*
|
||||
* We already know at this point that the first three bytes of the
|
||||
* strings match each other, and they can be safely passed over before
|
||||
* starting the compare loop. So what this code does is skip over 0-3
|
||||
* bytes, as much as necessary in order to dword-align the %edi
|
||||
* pointer. (%esi will still be misaligned three times out of four.)
|
||||
*
|
||||
* It should be confessed that this loop usually does not represent
|
||||
* much of the total running time. Replacing it with a more
|
||||
* straightforward "rep cmpsb" would not drastically degrade
|
||||
* performance.
|
||||
*/
|
||||
LoopCmps:
|
||||
movl (%esi,%edx), %eax
|
||||
xorl (%edi,%edx), %eax
|
||||
jnz LeaveLoopCmps
|
||||
movl 4(%esi,%edx), %eax
|
||||
xorl 4(%edi,%edx), %eax
|
||||
jnz LeaveLoopCmps4
|
||||
addl $8, %edx
|
||||
jnz LoopCmps
|
||||
jmp LenMaximum
/* Locate the lowest non-zero byte of the XOR result in %eax: skip 2 bytes */
/* if the low word is zero, then 1 more byte if the low byte is zero. */
|
||||
LeaveLoopCmps4: addl $4, %edx
|
||||
LeaveLoopCmps: testl $0x0000FFFF, %eax
|
||||
jnz LenLower
|
||||
addl $2, %edx
|
||||
shrl $16, %eax
/* subb sets the carry flag only when %al was zero (low byte matched); */
/* the adcl then adds that carry, stepping %edx past the matching byte. */
|
||||
LenLower: subb $1, %al
|
||||
adcl $0, %edx
|
||||
|
||||
/* Calculate the length of the match. If it is longer than MAX_MATCH, */
|
||||
/* then automatically accept it as the best possible match and leave. */
|
||||
|
||||
/* len = (%edi + %edx) - scan; %edi is left holding scan afterwards */
lea (%edi,%edx), %eax
|
||||
movl scan(%esp), %edi
|
||||
subl %edi, %eax
|
||||
cmpl $MAX_MATCH, %eax
|
||||
jge LenMaximum
|
||||
|
||||
/* If the length of the match is not longer than the best match we */
|
||||
/* have so far, then forget it and return to the lookup loop. */
|
||||
|
||||
movl deflatestate(%esp), %edx
|
||||
movl bestlen(%esp), %ebx
|
||||
cmpl %ebx, %eax
|
||||
jg LongerMatch
/* Restore the loop registers listed in the "Within this loop" table. */
|
||||
movl windowbestlen(%esp), %esi
|
||||
movl dsPrev(%edx), %edi
|
||||
movl scanend(%esp), %ebx
|
||||
movl chainlenwmask(%esp), %edx
|
||||
jmp LookupLoop
|
||||
|
||||
/* s->match_start = cur_match; */
|
||||
/* best_len = len; */
|
||||
/* if (len >= nice_match) break; */
|
||||
/* scan_end = *(ushf*)(scan+best_len-1); */
|
||||
|
||||
LongerMatch: movl nicematch(%esp), %ebx
|
||||
movl %eax, bestlen(%esp)
|
||||
movl %ecx, dsMatchStart(%edx)
|
||||
cmpl %ebx, %eax
|
||||
jge LeaveNow
|
||||
movl window(%esp), %esi
|
||||
addl %eax, %esi
|
||||
movl %esi, windowbestlen(%esp)
/* %edi still holds scan here (reloaded in the length calculation above) */
|
||||
movzwl -1(%edi,%eax), %ebx
|
||||
movl dsPrev(%edx), %edi
|
||||
movl %ebx, scanend(%esp)
|
||||
movl chainlenwmask(%esp), %edx
|
||||
jmp LookupLoop
|
||||
|
||||
/* Accept the current string, with the maximum possible length. */
|
||||
|
||||
LenMaximum: movl deflatestate(%esp), %edx
|
||||
movl $MAX_MATCH, bestlen(%esp)
|
||||
movl %ecx, dsMatchStart(%edx)
|
||||
|
||||
/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
|
||||
/* return s->lookahead; */
|
||||
|
||||
LeaveNow:
/* %eax = lookahead, kept as the return value when best_len is greater */
/* (signed comparison); otherwise best_len is returned. */
|
||||
movl deflatestate(%esp), %edx
|
||||
movl bestlen(%esp), %ebx
|
||||
movl dsLookahead(%edx), %eax
|
||||
cmpl %eax, %ebx
|
||||
jg LookaheadRet
|
||||
movl %ebx, %eax
|
||||
LookaheadRet:
|
||||
|
||||
/* Restore the stack and return from whence we came. */
|
||||
|
||||
addl $LocalVarsSize, %esp
|
||||
.cfi_def_cfa_offset 20
|
||||
popl %ebx
|
||||
.cfi_def_cfa_offset 16
|
||||
popl %esi
|
||||
.cfi_def_cfa_offset 12
|
||||
popl %edi
|
||||
.cfi_def_cfa_offset 8
|
||||
popl %ebp
|
||||
.cfi_def_cfa_offset 4
|
||||
.cfi_endproc
|
||||
/* longest_match has no ret of its own: execution falls through to the */
/* ret below, which is also the entire body of match_init. */
match_init: ret
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,2 +0,0 @@
|
|||
ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
|
||||
ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
|
|
@ -1,553 +0,0 @@
|
|||
;uInt longest_match_x64(
|
||||
; deflate_state *s,
|
||||
; IPos cur_match); /* current match */
|
||||
|
||||
; gvmat64.asm -- Asm portion of the optimized longest_match for 64-bit x86_64
|
||||
; (AMD64 on Athlon 64, Opteron, Phenom
|
||||
; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
|
||||
; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
|
||||
;
|
||||
; File written by Gilles Vollant, by converting to assembly the longest_match
|
||||
; from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
|
||||
;
|
||||
; and by taking inspiration on asm686 with masm, optimised assembly code
|
||||
; from Brian Raiter, written 1998
|
||||
;
|
||||
; This software is provided 'as-is', without any express or implied
|
||||
; warranty. In no event will the authors be held liable for any damages
|
||||
; arising from the use of this software.
|
||||
;
|
||||
; Permission is granted to anyone to use this software for any purpose,
|
||||
; including commercial applications, and to alter it and redistribute it
|
||||
; freely, subject to the following restrictions:
|
||||
;
|
||||
; 1. The origin of this software must not be misrepresented; you must not
|
||||
; claim that you wrote the original software. If you use this software
|
||||
; in a product, an acknowledgment in the product documentation would be
|
||||
; appreciated but is not required.
|
||||
; 2. Altered source versions must be plainly marked as such, and must not be
|
||||
; misrepresented as being the original software
|
||||
; 3. This notice may not be removed or altered from any source distribution.
|
||||
;
|
||||
;
|
||||
;
|
||||
; http://www.zlib.net
|
||||
; http://www.winimage.com/zLibDll
|
||||
; http://www.muppetlabs.com/~breadbox/software/assembly.html
|
||||
;
|
||||
; to compile this file for infozip Zip, I use option:
|
||||
; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
|
||||
;
|
||||
; to compile this file for zLib, I use option:
|
||||
; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
|
||||
; Be careful to adapt zlib1222add below to your version of zLib
|
||||
; (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
|
||||
; value of zlib1222add later)
|
||||
;
|
||||
; This file compiles with Microsoft Macro Assembler (x64) for AMD64
|
||||
;
|
||||
; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
|
||||
;
|
||||
; (you can get Windows WDK with ml64 for AMD64 from
|
||||
; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
|
||||
;
|
||||
|
||||
|
||||
;uInt longest_match(s, cur_match)
|
||||
; deflate_state *s;
|
||||
; IPos cur_match; /* current match */
|
||||
.code
|
||||
longest_match PROC
|
||||
|
||||
|
||||
;LocalVarsSize equ 88
|
||||
LocalVarsSize equ 72
|
||||
|
||||
; register used : rax,rbx,rcx,rdx,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13
|
||||
; free register : r14,r15
|
||||
; register can be saved : rsp
|
||||
|
||||
chainlenwmask equ rsp + 8 - LocalVarsSize ; high word: current chain len
|
||||
; low word: s->wmask
|
||||
;window equ rsp + xx - LocalVarsSize ; local copy of s->window ; stored in r10
|
||||
;windowbestlen equ rsp + xx - LocalVarsSize ; s->window + bestlen , use r10+r11
|
||||
;scanstart equ rsp + xx - LocalVarsSize ; first two bytes of string ; stored in r12w
|
||||
;scanend equ rsp + xx - LocalVarsSize ; last two bytes of string use ebx
|
||||
;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13
|
||||
;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d
|
||||
;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9
|
||||
IFDEF INFOZIP
|
||||
ELSE
|
||||
nicematch equ (rsp + 16 - LocalVarsSize) ; a good enough match size
|
||||
ENDIF
|
||||
|
||||
save_rdi equ rsp + 24 - LocalVarsSize
|
||||
save_rsi equ rsp + 32 - LocalVarsSize
|
||||
save_rbx equ rsp + 40 - LocalVarsSize
|
||||
save_rbp equ rsp + 48 - LocalVarsSize
|
||||
save_r12 equ rsp + 56 - LocalVarsSize
|
||||
save_r13 equ rsp + 64 - LocalVarsSize
|
||||
;save_r14 equ rsp + 72 - LocalVarsSize
|
||||
;save_r15 equ rsp + 80 - LocalVarsSize
|
||||
|
||||
|
||||
; summary of register usage
|
||||
; scanend ebx
|
||||
; scanendw bx
|
||||
; chainlenwmask edx
|
||||
; curmatch r8
|
||||
; curmatchd r8d
|
||||
; windowbestlen rsi
|
||||
; scanalign r13
|
||||
; scan r9
|
||||
; window r10
|
||||
; bestlen r11
|
||||
; bestlend r11d
|
||||
; scanstart r12d
|
||||
; scanstartw r12w
|
||||
; nicematch [nicematch] (local slot on zlib, global nice_match on infozip)
|
||||
; limit ebp
|
||||
; prev rdi
|
||||
; deflate_state *s rcx (zlib only)
|
||||
|
||||
; all the +4 offsets are due to the addition of pending_buf_size (in zlib
|
||||
; in the deflate_state structure since the asm code was first written
|
||||
; (if you compile with zlib 1.0.4 or older, remove the +4).
|
||||
; Note : these values are valid for a structure packed on 8-byte boundaries
|
||||
|
||||
|
||||
MAX_MATCH equ 258
|
||||
MIN_MATCH equ 3
|
||||
MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
|
||||
|
||||
|
||||
;;; Offsets for fields in the deflate_state structure. These numbers
|
||||
;;; are calculated from the definition of deflate_state, with the
|
||||
;;; assumption that the compiler will dword-align the fields. (Thus,
|
||||
;;; changing the definition of deflate_state could easily cause this
|
||||
;;; program to crash horribly, without so much as a warning at
|
||||
;;; compile time. Sigh.)
|
||||
|
||||
; all the +zlib1222add offsets are due to the addition of fields
|
||||
; in zlib in the deflate_state structure since the asm code was first written
|
||||
; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
|
||||
; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
|
||||
; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
|
||||
|
||||
|
||||
IFDEF INFOZIP
|
||||
|
||||
_DATA SEGMENT
|
||||
COMM window_size:DWORD
|
||||
; WMask ; 7fff
|
||||
COMM window:BYTE:010040H
|
||||
COMM prev:WORD:08000H
|
||||
; MatchLen : unused
|
||||
; PrevMatch : unused
|
||||
COMM strstart:DWORD
|
||||
COMM match_start:DWORD
|
||||
; Lookahead : ignore
|
||||
COMM prev_length:DWORD ; PrevLen
|
||||
COMM max_chain_length:DWORD
|
||||
COMM good_match:DWORD
|
||||
COMM nice_match:DWORD
|
||||
prev_ad equ OFFSET prev
|
||||
window_ad equ OFFSET window
|
||||
nicematch equ nice_match
|
||||
_DATA ENDS
|
||||
WMask equ 07fffh
|
||||
|
||||
ELSE
|
||||
|
||||
IFNDEF zlib1222add
|
||||
zlib1222add equ 8
|
||||
ENDIF
|
||||
dsWSize equ 56+zlib1222add+(zlib1222add/2)
|
||||
dsWMask equ 64+zlib1222add+(zlib1222add/2)
|
||||
dsWindow equ 72+zlib1222add
|
||||
dsPrev equ 88+zlib1222add
|
||||
dsMatchLen equ 128+zlib1222add
|
||||
dsPrevMatch equ 132+zlib1222add
|
||||
dsStrStart equ 140+zlib1222add
|
||||
dsMatchStart equ 144+zlib1222add
|
||||
dsLookahead equ 148+zlib1222add
|
||||
dsPrevLen equ 152+zlib1222add
|
||||
dsMaxChainLen equ 156+zlib1222add
|
||||
dsGoodMatch equ 172+zlib1222add
|
||||
dsNiceMatch equ 176+zlib1222add
|
||||
|
||||
window_size equ [ rcx + dsWSize]
|
||||
WMask equ [ rcx + dsWMask]
|
||||
window_ad equ [ rcx + dsWindow]
|
||||
prev_ad equ [ rcx + dsPrev]
|
||||
strstart equ [ rcx + dsStrStart]
|
||||
match_start equ [ rcx + dsMatchStart]
|
||||
Lookahead equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
|
||||
prev_length equ [ rcx + dsPrevLen]
|
||||
max_chain_length equ [ rcx + dsMaxChainLen]
|
||||
good_match equ [ rcx + dsGoodMatch]
|
||||
nice_match equ [ rcx + dsNiceMatch]
|
||||
ENDIF
|
||||
|
||||
; parameter 1 in rcx (deflate_state *s), param 2 in edx (cur_match); on infozip, cur_match is in ecx
|
||||
|
||||
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
|
||||
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
|
||||
;
|
||||
; All registers must be preserved across the call, except for
|
||||
; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
|
||||
|
||||
|
||||
|
||||
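;;; Save registers that the compiler may be using. rsp itself is not
|
||||
;;; adjusted: the save slots and locals are addressed below rsp
|
||||
;;; (rsp + n - LocalVarsSize). NOTE(review): the Windows x64 ABI has no
|
||||
;;; red zone, so this area is not guaranteed to be preserved -- confirm.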
|
||||
|
||||
|
||||
;;; Retrieve the function arguments. r8d will hold cur_match
|
||||
;;; throughout the entire function. rcx holds the pointer to the
|
||||
;;; deflate_state structure (zlib build; the infozip build uses
|
||||
;;; global variables instead).
|
||||
|
||||
; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
|
||||
|
||||
; this clears the high 32 bits of r8, which can be garbage in both r8 and rdx
|
||||
|
||||
mov [save_rdi],rdi
|
||||
mov [save_rsi],rsi
|
||||
mov [save_rbx],rbx
|
||||
mov [save_rbp],rbp
|
||||
IFDEF INFOZIP
|
||||
mov r8d,ecx
|
||||
ELSE
|
||||
mov r8d,edx
|
||||
ENDIF
|
||||
mov [save_r12],r12
|
||||
mov [save_r13],r13
|
||||
; mov [save_r14],r14
|
||||
; mov [save_r15],r15
|
||||
|
||||
|
||||
;;; uInt wmask = s->w_mask;
|
||||
;;; unsigned chain_length = s->max_chain_length;
|
||||
;;; if (s->prev_length >= s->good_match) {
|
||||
;;; chain_length >>= 2;
|
||||
;;; }
|
||||
|
||||
mov edi, prev_length
|
||||
mov esi, good_match
|
||||
mov eax, WMask
|
||||
mov ebx, max_chain_length
|
||||
cmp edi, esi
|
||||
jl LastMatchGood
|
||||
shr ebx, 2
|
||||
LastMatchGood:
|
||||
|
||||
;;; chainlen is decremented once beforehand so that the function can
|
||||
;;; use the sign flag instead of the zero flag for the exit test.
|
||||
;;; It is then shifted into the high word, to make room for the wmask
|
||||
;;; value, which it will always accompany.
|
||||
|
||||
dec ebx
|
||||
shl ebx, 16
|
||||
or ebx, eax
|
||||
|
||||
;;; on zlib only
|
||||
;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
|
||||
|
||||
IFDEF INFOZIP
|
||||
mov [chainlenwmask], ebx
|
||||
; on infozip nice_match = [nice_match]
|
||||
ELSE
|
||||
mov eax, nice_match
|
||||
mov [chainlenwmask], ebx
|
||||
mov r10d, Lookahead
|
||||
cmp r10d, eax
|
||||
cmovnl r10d, eax
|
||||
mov [nicematch],r10d
|
||||
ENDIF
|
||||
|
||||
;;; register Bytef *scan = s->window + s->strstart;
|
||||
mov r10, window_ad
|
||||
mov ebp, strstart
|
||||
lea r13, [r10 + rbp]
|
||||
|
||||
;;; Determine how many bytes the scan ptr is off from being
|
||||
;;; dword-aligned.
|
||||
|
||||
mov r9,r13
|
||||
neg r13
|
||||
and r13,3
|
||||
|
||||
;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
|
||||
;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
|
||||
IFDEF INFOZIP
|
||||
mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1))
|
||||
ELSE
|
||||
mov eax, window_size
|
||||
sub eax, MIN_LOOKAHEAD
|
||||
ENDIF
|
||||
xor edi,edi
|
||||
sub ebp, eax
|
||||
|
||||
mov r11d, prev_length
|
||||
|
||||
cmovng ebp,edi
|
||||
|
||||
;;; int best_len = s->prev_length;
|
||||
|
||||
|
||||
;;; Keep the sum of s->window + best_len in rsi.
|
||||
|
||||
lea rsi,[r10+r11]
|
||||
|
||||
;;; register ush scan_start = *(ushf*)scan;
|
||||
;;; register ush scan_end = *(ushf*)(scan+best_len-1);
|
||||
;;; Posf *prev = s->prev;
|
||||
|
||||
movzx r12d,word ptr [r9]
|
||||
movzx ebx, word ptr [r9 + r11 - 1]
|
||||
|
||||
mov rdi, prev_ad
|
||||
|
||||
;;; Jump into the main loop.
|
||||
|
||||
mov edx, [chainlenwmask]
|
||||
|
||||
cmp bx,word ptr [rsi + r8 - 1]
|
||||
jz LookupLoopIsZero
|
||||
|
||||
LookupLoop1:
|
||||
and r8d, edx
|
||||
|
||||
movzx r8d, word ptr [rdi + r8*2]
|
||||
cmp r8d, ebp
|
||||
jbe LeaveNow
|
||||
sub edx, 00010000h
|
||||
js LeaveNow
|
||||
|
||||
LoopEntry1:
|
||||
cmp bx,word ptr [rsi + r8 - 1]
|
||||
jz LookupLoopIsZero
|
||||
|
||||
LookupLoop2:
|
||||
and r8d, edx
|
||||
|
||||
movzx r8d, word ptr [rdi + r8*2]
|
||||
cmp r8d, ebp
|
||||
jbe LeaveNow
|
||||
sub edx, 00010000h
|
||||
js LeaveNow
|
||||
|
||||
LoopEntry2:
|
||||
cmp bx,word ptr [rsi + r8 - 1]
|
||||
jz LookupLoopIsZero
|
||||
|
||||
LookupLoop4:
|
||||
and r8d, edx
|
||||
|
||||
movzx r8d, word ptr [rdi + r8*2]
|
||||
cmp r8d, ebp
|
||||
jbe LeaveNow
|
||||
sub edx, 00010000h
|
||||
js LeaveNow
|
||||
|
||||
LoopEntry4:
|
||||
|
||||
cmp bx,word ptr [rsi + r8 - 1]
|
||||
jnz LookupLoop1
|
||||
jmp LookupLoopIsZero
|
||||
|
||||
|
||||
;;; do {
|
||||
;;; match = s->window + cur_match;
|
||||
;;; if (*(ushf*)(match+best_len-1) != scan_end ||
|
||||
;;; *(ushf*)match != scan_start) continue;
|
||||
;;; [...]
|
||||
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
|
||||
;;; && --chain_length != 0);
|
||||
;;;
|
||||
;;; Here is the inner loop of the function. The function will spend the
|
||||
;;; majority of its time in this loop, and majority of that time will
|
||||
;;; be spent in the first ten instructions.
|
||||
;;;
|
||||
;;; Within this loop:
|
||||
;;; ebx = scanend
|
||||
;;; r8d = curmatch
|
||||
;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
|
||||
;;; esi = windowbestlen - i.e., (window + bestlen)
|
||||
;;; edi = prev
|
||||
;;; ebp = limit
|
||||
|
||||
LookupLoop:
|
||||
and r8d, edx
|
||||
|
||||
movzx r8d, word ptr [rdi + r8*2]
|
||||
cmp r8d, ebp
|
||||
jbe LeaveNow
|
||||
sub edx, 00010000h
|
||||
js LeaveNow
|
||||
|
||||
LoopEntry:
|
||||
|
||||
cmp bx,word ptr [rsi + r8 - 1]
|
||||
jnz LookupLoop1
|
||||
LookupLoopIsZero:
|
||||
cmp r12w, word ptr [r10 + r8]
|
||||
jnz LookupLoop1
|
||||
|
||||
|
||||
;;; Store the current value of chainlen.
|
||||
mov [chainlenwmask], edx
|
||||
|
||||
;;; Point rdi to the string under scrutiny, and rsi to the string we
|
||||
;;; are hoping to match it up with. In actuality, rsi and rdi are
|
||||
;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and rdx is
|
||||
;;; initialized to -(MAX_MATCH_8).
|
||||
|
||||
lea rsi,[r8+r10]
|
||||
mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
|
||||
lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
|
||||
lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
|
||||
|
||||
prefetcht1 [rsi+rdx]
|
||||
prefetcht1 [rdi+rdx]
|
||||
|
||||
|
||||
;;; Test the strings for equality, 8 bytes at a time. At the end,
|
||||
;;; adjust rdx so that it is offset to the exact byte that mismatched.
|
||||
;;;
|
||||
;;; We already know at this point that the first three bytes of the
|
||||
;;; strings match each other, and they can be safely passed over before
|
||||
;;; starting the compare loop. So what this code does is skip over 0-3
|
||||
;;; bytes, as much as necessary in order to dword-align the edi
|
||||
;;; pointer. (rsi will still be misaligned three times out of four.)
|
||||
;;;
|
||||
;;; It should be confessed that this loop usually does not represent
|
||||
;;; much of the total running time. Replacing it with a more
|
||||
;;; straightforward "rep cmpsb" would not drastically degrade
|
||||
;;; performance.
|
||||
|
||||
|
||||
LoopCmps:
|
||||
mov rax, [rsi + rdx]
|
||||
xor rax, [rdi + rdx]
|
||||
jnz LeaveLoopCmps
|
||||
|
||||
mov rax, [rsi + rdx + 8]
|
||||
xor rax, [rdi + rdx + 8]
|
||||
jnz LeaveLoopCmps8
|
||||
|
||||
|
||||
mov rax, [rsi + rdx + 8+8]
|
||||
xor rax, [rdi + rdx + 8+8]
|
||||
jnz LeaveLoopCmps16
|
||||
|
||||
add rdx,8+8+8
|
||||
|
||||
jnz short LoopCmps
|
||||
jmp short LenMaximum
|
||||
LeaveLoopCmps16: add rdx,8
|
||||
LeaveLoopCmps8: add rdx,8
|
||||
LeaveLoopCmps:
|
||||
|
||||
test eax, 0000FFFFh
|
||||
jnz LenLower
|
||||
|
||||
test eax,0ffffffffh
|
||||
|
||||
jnz LenLower32
|
||||
|
||||
add rdx,4
|
||||
shr rax,32
|
||||
or ax,ax
|
||||
jnz LenLower
|
||||
|
||||
LenLower32:
|
||||
shr eax,16
|
||||
add rdx,2
|
||||
LenLower: sub al, 1
|
||||
adc rdx, 0
|
||||
;;; Calculate the length of the match. If it is at least MAX_MATCH,
|
||||
;;; then automatically accept it as the best possible match and leave.
|
||||
|
||||
lea rax, [rdi + rdx]
|
||||
sub rax, r9
|
||||
cmp eax, MAX_MATCH
|
||||
jge LenMaximum
|
||||
|
||||
;;; If the length of the match is not longer than the best match we
|
||||
;;; have so far, then forget it and return to the lookup loop.
|
||||
;///////////////////////////////////
|
||||
|
||||
cmp eax, r11d
|
||||
jg LongerMatch
|
||||
|
||||
lea rsi,[r10+r11]
|
||||
|
||||
mov rdi, prev_ad
|
||||
mov edx, [chainlenwmask]
|
||||
jmp LookupLoop
|
||||
|
||||
;;; s->match_start = cur_match;
|
||||
;;; best_len = len;
|
||||
;;; if (len >= nice_match) break;
|
||||
;;; scan_end = *(ushf*)(scan+best_len-1);
|
||||
|
||||
LongerMatch:
|
||||
mov r11d, eax
|
||||
mov match_start, r8d
|
||||
cmp eax, [nicematch]
|
||||
jge LeaveNow
|
||||
|
||||
lea rsi,[r10+rax]
|
||||
|
||||
movzx ebx, word ptr [r9 + rax - 1]
|
||||
mov rdi, prev_ad
|
||||
mov edx, [chainlenwmask]
|
||||
jmp LookupLoop
|
||||
|
||||
;;; Accept the current string, with the maximum possible length.
|
||||
|
||||
LenMaximum:
|
||||
mov r11d,MAX_MATCH
|
||||
mov match_start, r8d
|
||||
|
||||
;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
|
||||
;;; return s->lookahead;
|
||||
|
||||
LeaveNow:
|
||||
IFDEF INFOZIP
|
||||
mov eax,r11d
|
||||
ELSE
|
||||
mov eax, Lookahead
|
||||
cmp r11d, eax
|
||||
cmovng eax, r11d
|
||||
ENDIF
|
||||
|
||||
;;; Restore the stack and return from whence we came.
|
||||
|
||||
|
||||
mov rsi,[save_rsi]
|
||||
mov rdi,[save_rdi]
|
||||
mov rbx,[save_rbx]
|
||||
mov rbp,[save_rbp]
|
||||
mov r12,[save_r12]
|
||||
mov r13,[save_r13]
|
||||
; mov r14,[save_r14]
|
||||
; mov r15,[save_r15]
|
||||
|
||||
|
||||
ret 0
|
||||
; please don't remove this string !
|
||||
; You can freely use gvmat64 in any free or commercial app
|
||||
; but it is far better not to remove the string from the binary!
|
||||
db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
|
||||
longest_match ENDP
|
||||
|
||||
match_init PROC ; no-op entry point: the body is a single return
|
||||
ret 0
|
||||
match_init ENDP
|
||||
|
||||
|
||||
END
|
|
@ -1,186 +0,0 @@
|
|||
/* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding
|
||||
* version for AMD64 on Windows using Microsoft C compiler
|
||||
*
|
||||
* Copyright (C) 1995-2003 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*
|
||||
* Copyright (C) 2003 Chris Anderson <christop@charm.net>
|
||||
* Please use the copyright conditions above.
|
||||
*
|
||||
* 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant
|
||||
*
|
||||
* inffas8664.c calls function inffas8664fnc in inffasx64.asm
|
||||
* inffasx64.asm is automatically converted from the AMD64 portion of inffas86.c
|
||||
*
|
||||
* Dec-29-2003 -- I added AMD64 inflate asm support. This version is also
|
||||
* slightly quicker on x86 systems because, instead of using rep movsb to copy
|
||||
* data, it uses rep movsw, which moves data in 2-byte chunks instead of single
|
||||
* bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates
|
||||
* from http://fedora.linux.duke.edu/fc1_x86_64
|
||||
* which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with
|
||||
* 1GB ram. The 64-bit version is about 4% faster than the 32-bit version,
|
||||
* when decompressing mozilla-source-1.3.tar.gz.
|
||||
*
|
||||
* Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
|
||||
* the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
|
||||
* the moment. I have successfully compiled and tested this code with gcc2.96,
|
||||
* gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
|
||||
* compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
|
||||
* enabled. I will attempt to merge the MMX code into this version. Newer
|
||||
* versions of this and inffast.S can be found at
|
||||
* http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "zutil.h"
|
||||
#include "inftrees.h"
|
||||
#include "inflate.h"
|
||||
#include "inffast.h"
|
||||
|
||||
/* Mark Adler's comments from inffast.c: */
|
||||
|
||||
/*
|
||||
Decode literal, length, and distance codes and write out the resulting
|
||||
literal and match bytes until either not enough input or output is
|
||||
available, an end-of-block is encountered, or a data error is encountered.
|
||||
When large enough input and output buffers are supplied to inflate(), for
|
||||
example, a 16K input buffer and a 64K output buffer, more than 95% of the
|
||||
inflate execution time is spent in this routine.
|
||||
|
||||
Entry assumptions:
|
||||
|
||||
state->mode == LEN
|
||||
strm->avail_in >= 6
|
||||
strm->avail_out >= 258
|
||||
start >= strm->avail_out
|
||||
state->bits < 8
|
||||
|
||||
On return, state->mode is one of:
|
||||
|
||||
LEN -- ran out of enough output space or enough available input
|
||||
TYPE -- reached end of block code, inflate() to interpret next block
|
||||
BAD -- error in block data
|
||||
|
||||
Notes:
|
||||
|
||||
- The maximum input bits used by a length/distance pair is 15 bits for the
|
||||
length code, 5 bits for the length extra, 15 bits for the distance code,
|
||||
and 13 bits for the distance extra. This totals 48 bits, or six bytes.
|
||||
Therefore if strm->avail_in >= 6, then there is enough input to avoid
|
||||
checking for available input while decoding.
|
||||
|
||||
- The maximum bytes that a single length/distance pair can output is 258
|
||||
bytes, which is the maximum length that can be coded. inflate_fast()
|
||||
requires strm->avail_out >= 258 for each loop to avoid checking for
|
||||
output space.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
/* Parameter/state record passed by address to inffas8664fnc(). The two
   numeric columns are the byte offsets of each field in the 64-bit and
   32-bit layouts; the assembler addresses the fields by these offsets,
   so the field order and sizes must not change. */
typedef struct inffast_ar {
|
||||
/* 64 32 x86 x86_64 */
|
||||
/* ar offset register */
|
||||
/* 0 0 */ void *esp; /* esp/rsp save (written by the asm) */
|
||||
/* 8 4 */ void *ebp; /* ebp/rbp save (written by the asm) */
|
||||
/* 16 8 */ unsigned char FAR *in; /* esi rsi local strm->next_in */
|
||||
/* 24 12 */ unsigned char FAR *last; /* r9 while in < last */
|
||||
/* 32 16 */ unsigned char FAR *out; /* edi rdi local strm->next_out */
|
||||
/* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out */
|
||||
/* 48 24 */ unsigned char FAR *end; /* r10 while out < end */
|
||||
/* 56 28 */ unsigned char FAR *window;/* sliding window (state->window), wsize!=0 */
|
||||
/* 64 32 */ code const FAR *lcode; /* ebp rbp local state->lencode */
|
||||
/* 72 36 */ code const FAR *dcode; /* r11 local state->distcode */
|
||||
/* 80 40 */ size_t /*unsigned long */hold; /* edx rdx local state->hold */
|
||||
/* 88 44 */ unsigned bits; /* ebx rbx local state->bits */
|
||||
/* 92 48 */ unsigned wsize; /* window size (state->wsize) */
|
||||
/* 96 52 */ unsigned write; /* window write index (state->wnext) */
|
||||
/*100 56 */ unsigned lmask; /* r12 mask for lcode */
|
||||
/*104 60 */ unsigned dmask; /* r13 mask for dcode */
|
||||
/*108 64 */ unsigned len; /* r14 match length */
|
||||
/*112 68 */ unsigned dist; /* r15 match distance */
|
||||
/*116 72 */ unsigned status; /* set when state changes: 1 -> TYPE, >1 -> BAD (2 lit/len code, 3 dist code, other: dist too far back) */
|
||||
} type_ar;
|
||||
#ifdef ASMINF
|
||||
|
||||
/* C wrapper around the assembler decoder: copies the stream and inflate
   state into a type_ar record, feeds leading input bytes into the bit
   accumulator until the input pointer is aligned, calls inffas8664fnc(),
   translates ar.status into state->mode / strm->msg, and writes the
   updated pointers, counts and bit accumulator back. */
void inflate_fast(strm, start)
|
||||
z_streamp strm;
|
||||
unsigned start; /* inflate()'s starting value for strm->avail_out */
|
||||
{
|
||||
struct inflate_state FAR *state;
|
||||
type_ar ar; /* record handed to the assembler routine by address */
|
||||
void inffas8664fnc(struct inffast_ar * par); /* assembler decoder (inffasx64.asm), declared locally */
|
||||
|
||||
|
||||
|
||||
#if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64))
|
||||
#define PAD_AVAIL_IN 6 /* input bytes kept in reserve behind ar.last */
|
||||
#define PAD_AVAIL_OUT 258 /* output bytes kept in reserve behind ar.end */
|
||||
#else
|
||||
#define PAD_AVAIL_IN 5 /* input bytes kept in reserve behind ar.last */
|
||||
#define PAD_AVAIL_OUT 257 /* output bytes kept in reserve behind ar.end */
|
||||
#endif
|
||||
|
||||
/* copy state to local variables */
|
||||
state = (struct inflate_state FAR *)strm->state;
|
||||
|
||||
ar.in = strm->next_in;
|
||||
ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN); /* PAD_AVAIL_IN bytes before the end of the input */
|
||||
ar.out = strm->next_out;
|
||||
ar.beg = ar.out - (start - strm->avail_out); /* next_out minus the bytes already produced since start */
|
||||
ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT); /* PAD_AVAIL_OUT bytes before the end of the output */
|
||||
ar.wsize = state->wsize;
|
||||
ar.write = state->wnext;
|
||||
ar.window = state->window;
|
||||
ar.hold = state->hold;
|
||||
ar.bits = state->bits;
|
||||
ar.lcode = state->lencode;
|
||||
ar.dcode = state->distcode;
|
||||
ar.lmask = (1U << state->lenbits) - 1; /* low lenbits bits set */
|
||||
ar.dmask = (1U << state->distbits) - 1; /* low distbits bits set */
|
||||
|
||||
/* decode literals and length/distances until end-of-block or not enough
|
||||
input data or output space */
|
||||
|
||||
/* align in on 1/2 hold size boundary, shifting each skipped byte into hold */
|
||||
while (((size_t)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) {
|
||||
ar.hold += (unsigned long)*ar.in++ << ar.bits;
|
||||
ar.bits += 8;
|
||||
}
|
||||
|
||||
inffas8664fnc(&ar); /* NOTE(review): ar.status is never assigned in this function before it is read below; presumably the asm always sets it -- confirm */
|
||||
|
||||
/* status > 1: data error (message chosen by code); status == 1: end of block */
|
||||
if (ar.status > 1) {
|
||||
if (ar.status == 2)
|
||||
strm->msg = "invalid literal/length code";
|
||||
else if (ar.status == 3)
|
||||
strm->msg = "invalid distance code";
|
||||
else
|
||||
strm->msg = "invalid distance too far back";
|
||||
state->mode = BAD;
|
||||
}
|
||||
else if ( ar.status == 1 ) {
|
||||
state->mode = TYPE;
|
||||
}
|
||||
|
||||
/* return unused bytes (on entry, bits < 8, so in won't go too far back) */
|
||||
ar.len = ar.bits >> 3; /* whole bytes still sitting in hold */
|
||||
ar.in -= ar.len;
|
||||
ar.bits -= ar.len << 3;
|
||||
ar.hold &= (1U << ar.bits) - 1; /* keep only the remaining low bits */
|
||||
|
||||
/* update state and return */
|
||||
strm->next_in = ar.in;
|
||||
strm->next_out = ar.out;
|
||||
/* rebuild the counts from the distance to last/end, whichever side in/out is on */
|
||||
strm->avail_in = (unsigned)(ar.in < ar.last ?
|
||||
PAD_AVAIL_IN + (ar.last - ar.in) :
|
||||
PAD_AVAIL_IN - (ar.in - ar.last));
|
||||
strm->avail_out = (unsigned)(ar.out < ar.end ?
|
||||
PAD_AVAIL_OUT + (ar.end - ar.out) :
|
||||
PAD_AVAIL_OUT - (ar.out - ar.end));
|
||||
state->hold = (unsigned long)ar.hold;
|
||||
state->bits = ar.bits;
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
|
@ -1,396 +0,0 @@
|
|||
; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding
|
||||
; version for AMD64 on Windows using Microsoft C compiler
|
||||
;
|
||||
; inffasx64.asm is automatically converted from the AMD64 portion of inffas86.c
|
||||
; inffasx64.asm is called by inffas8664.c, which contains more info.
|
||||
|
||||
|
||||
; to compile this file, I use option
|
||||
; ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
|
||||
; with Microsoft Macro Assembler (x64) for AMD64
|
||||
;
|
||||
|
||||
; This file compiles with Microsoft Macro Assembler (x64) for AMD64
|
||||
;
|
||||
; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
|
||||
;
|
||||
; (you can get Windows WDK with ml64 for AMD64 from
|
||||
; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
|
||||
;
|
||||
|
||||
|
||||
.code
|
||||
inffas8664fnc PROC
|
||||
|
||||
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
|
||||
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
|
||||
;
|
||||
; All registers must be preserved across the call, except for
|
||||
; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
|
||||
|
||||
|
||||
mov [rsp-8],rsi
|
||||
mov [rsp-16],rdi
|
||||
mov [rsp-24],r12
|
||||
mov [rsp-32],r13
|
||||
mov [rsp-40],r14
|
||||
mov [rsp-48],r15
|
||||
mov [rsp-56],rbx
|
||||
|
||||
mov rax,rcx
|
||||
|
||||
mov [rax+8], rbp ; /* save regs rbp and rsp */
|
||||
mov [rax], rsp
|
||||
|
||||
mov rsp, rax ; /* make rsp point to &ar */
|
||||
|
||||
mov rsi, [rsp+16] ; /* rsi = in */
|
||||
mov rdi, [rsp+32] ; /* rdi = out */
|
||||
mov r9, [rsp+24] ; /* r9 = last */
|
||||
mov r10, [rsp+48] ; /* r10 = end */
|
||||
mov rbp, [rsp+64] ; /* rbp = lcode */
|
||||
mov r11, [rsp+72] ; /* r11 = dcode */
|
||||
mov rdx, [rsp+80] ; /* rdx = hold */
|
||||
mov ebx, [rsp+88] ; /* ebx = bits */
|
||||
mov r12d, [rsp+100] ; /* r12d = lmask */
|
||||
mov r13d, [rsp+104] ; /* r13d = dmask */
|
||||
; /* r14d = len */
|
||||
; /* r15d = dist */
|
||||
|
||||
|
||||
cld
|
||||
cmp r10, rdi
|
||||
je L_one_time ; /* if only one decode left */
|
||||
cmp r9, rsi
|
||||
|
||||
jne L_do_loop
|
||||
|
||||
|
||||
L_one_time:
|
||||
mov r8, r12 ; /* r8 = lmask */
|
||||
cmp bl, 32
|
||||
ja L_get_length_code_one_time
|
||||
|
||||
lodsd ; /* eax = *(uint *)in++ */
|
||||
mov cl, bl ; /* cl = bits, needs it for shifting */
|
||||
add bl, 32 ; /* bits += 32 */
|
||||
shl rax, cl
|
||||
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
|
||||
jmp L_get_length_code_one_time
|
||||
|
||||
ALIGN 4
|
||||
L_while_test:
|
||||
cmp r10, rdi
|
||||
jbe L_break_loop
|
||||
cmp r9, rsi
|
||||
jbe L_break_loop
|
||||
|
||||
L_do_loop:
|
||||
mov r8, r12 ; /* r8 = lmask */
|
||||
cmp bl, 32
|
||||
ja L_get_length_code ; /* if (32 < bits) */
|
||||
|
||||
lodsd ; /* eax = *(uint *)in++ */
|
||||
mov cl, bl ; /* cl = bits, needs it for shifting */
|
||||
add bl, 32 ; /* bits += 32 */
|
||||
shl rax, cl
|
||||
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
|
||||
|
||||
L_get_length_code:
|
||||
and r8, rdx ; /* r8 &= hold */
|
||||
mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
|
||||
|
||||
mov cl, ah ; /* cl = this.bits */
|
||||
sub bl, ah ; /* bits -= this.bits */
|
||||
shr rdx, cl ; /* hold >>= this.bits */
|
||||
|
||||
test al, al
|
||||
jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
|
||||
|
||||
mov r8, r12 ; /* r8 = lmask */
|
||||
shr eax, 16 ; /* output this.val char */
|
||||
stosb
|
||||
|
||||
L_get_length_code_one_time:
|
||||
and r8, rdx ; /* r8 &= hold */
|
||||
mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
|
||||
|
||||
L_dolen:
|
||||
mov cl, ah ; /* cl = this.bits */
|
||||
sub bl, ah ; /* bits -= this.bits */
|
||||
shr rdx, cl ; /* hold >>= this.bits */
|
||||
|
||||
test al, al
|
||||
jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
|
||||
|
||||
shr eax, 16 ; /* output this.val char */
|
||||
stosb
|
||||
jmp L_while_test
|
||||
|
||||
ALIGN 4
|
||||
L_test_for_length_base:
|
||||
mov r14d, eax ; /* len = this */
|
||||
shr r14d, 16 ; /* len = this.val */
|
||||
mov cl, al
|
||||
|
||||
test al, 16
|
||||
jz L_test_for_second_level_length ; /* if ((op & 16) == 0) 8% */
|
||||
and cl, 15 ; /* op &= 15 */
|
||||
jz L_decode_distance ; /* if (!op) */
|
||||
|
||||
L_add_bits_to_len:
|
||||
sub bl, cl
|
||||
xor eax, eax
|
||||
inc eax
|
||||
shl eax, cl
|
||||
dec eax
|
||||
and eax, edx ; /* eax &= hold */
|
||||
shr rdx, cl
|
||||
add r14d, eax ; /* len += hold & mask[op] */
|
||||
|
||||
L_decode_distance:
|
||||
mov r8, r13 ; /* r8 = dmask */
|
||||
cmp bl, 32
|
||||
ja L_get_distance_code ; /* if (32 < bits) */
|
||||
|
||||
lodsd ; /* eax = *(uint *)in++ */
|
||||
mov cl, bl ; /* cl = bits, needs it for shifting */
|
||||
add bl, 32 ; /* bits += 32 */
|
||||
shl rax, cl
|
||||
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
|
||||
|
||||
L_get_distance_code:
|
||||
and r8, rdx ; /* r8 &= hold */
|
||||
mov eax, [r11+r8*4] ; /* eax = dcode[hold & dmask] */
|
||||
|
||||
L_dodist:
|
||||
mov r15d, eax ; /* dist = this */
|
||||
shr r15d, 16 ; /* dist = this.val */
|
||||
mov cl, ah
|
||||
sub bl, ah ; /* bits -= this.bits */
|
||||
shr rdx, cl ; /* hold >>= this.bits */
|
||||
mov cl, al ; /* cl = this.op */
|
||||
|
||||
test al, 16 ; /* if ((op & 16) == 0) */
|
||||
jz L_test_for_second_level_dist
|
||||
and cl, 15 ; /* op &= 15 */
|
||||
jz L_check_dist_one
|
||||
|
||||
L_add_bits_to_dist:
|
||||
sub bl, cl
|
||||
xor eax, eax
|
||||
inc eax
|
||||
shl eax, cl
|
||||
dec eax ; /* (1 << op) - 1 */
|
||||
and eax, edx ; /* eax &= hold */
|
||||
shr rdx, cl
|
||||
add r15d, eax ; /* dist += hold & ((1 << op) - 1) */
|
||||
|
||||
L_check_window:
|
||||
mov r8, rsi ; /* save in so from can use it's reg */
|
||||
mov rax, rdi
|
||||
sub rax, [rsp+40] ; /* nbytes = out - beg */
|
||||
|
||||
cmp eax, r15d
|
||||
jb L_clip_window ; /* if (dist > nbytes) 4.2% */
|
||||
|
||||
mov ecx, r14d ; /* ecx = len */
|
||||
mov rsi, rdi
|
||||
sub rsi, r15 ; /* from = out - dist */
|
||||
|
||||
sar ecx, 1
|
||||
jnc L_copy_two ; /* if len % 2 == 0 */
|
||||
|
||||
rep movsw
|
||||
mov al, [rsi]
|
||||
mov [rdi], al
|
||||
inc rdi
|
||||
|
||||
mov rsi, r8 ; /* move in back to %rsi, toss from */
|
||||
jmp L_while_test
|
||||
|
||||
L_copy_two:
|
||||
rep movsw
|
||||
mov rsi, r8 ; /* move in back to %rsi, toss from */
|
||||
jmp L_while_test
|
||||
|
||||
ALIGN 4
|
||||
L_check_dist_one:
|
||||
cmp r15d, 1 ; /* if dist 1, is a memset */
|
||||
jne L_check_window
|
||||
cmp [rsp+40], rdi ; /* if out == beg, outside window */
|
||||
je L_check_window
|
||||
|
||||
mov ecx, r14d ; /* ecx = len */
|
||||
mov al, [rdi-1]
|
||||
mov ah, al
|
||||
|
||||
sar ecx, 1
|
||||
jnc L_set_two
|
||||
mov [rdi], al
|
||||
inc rdi
|
||||
|
||||
L_set_two:
|
||||
rep stosw
|
||||
jmp L_while_test
|
||||
|
||||
ALIGN 4
|
||||
L_test_for_second_level_length:
|
||||
test al, 64
|
||||
jnz L_test_for_end_of_block ; /* if ((op & 64) != 0) */
|
||||
|
||||
xor eax, eax
|
||||
inc eax
|
||||
shl eax, cl
|
||||
dec eax
|
||||
and eax, edx ; /* eax &= hold */
|
||||
add eax, r14d ; /* eax += len */
|
||||
mov eax, [rbp+rax*4] ; /* eax = lcode[val+(hold&mask[op])]*/
|
||||
jmp L_dolen
|
||||
|
||||
ALIGN 4
|
||||
L_test_for_second_level_dist:
|
||||
test al, 64
|
||||
jnz L_invalid_distance_code ; /* if ((op & 64) != 0) */
|
||||
|
||||
xor eax, eax
|
||||
inc eax
|
||||
shl eax, cl
|
||||
dec eax
|
||||
and eax, edx ; /* eax &= hold */
|
||||
add eax, r15d ; /* eax += dist */
|
||||
mov eax, [r11+rax*4] ; /* eax = dcode[val+(hold&mask[op])]*/
|
||||
jmp L_dodist
|
||||
|
||||
ALIGN 4
|
||||
L_clip_window:
|
||||
mov ecx, eax ; /* ecx = nbytes */
|
||||
mov eax, [rsp+92] ; /* eax = wsize, prepare for dist cmp */
|
||||
neg ecx ; /* nbytes = -nbytes */
|
||||
|
||||
cmp eax, r15d
|
||||
jb L_invalid_distance_too_far ; /* if (dist > wsize) */
|
||||
|
||||
add ecx, r15d ; /* nbytes = dist - nbytes */
|
||||
cmp dword ptr [rsp+96], 0
|
||||
jne L_wrap_around_window ; /* if (write != 0) */
|
||||
|
||||
mov rsi, [rsp+56] ; /* from = window */
|
||||
sub eax, ecx ; /* eax -= nbytes */
|
||||
add rsi, rax ; /* from += wsize - nbytes */
|
||||
|
||||
mov eax, r14d ; /* eax = len */
|
||||
cmp r14d, ecx
|
||||
jbe L_do_copy ; /* if (nbytes >= len) */
|
||||
|
||||
sub eax, ecx ; /* eax -= nbytes */
|
||||
rep movsb
|
||||
mov rsi, rdi
|
||||
sub rsi, r15 ; /* from = &out[ -dist ] */
|
||||
jmp L_do_copy
|
||||
|
||||
ALIGN 4
|
||||
L_wrap_around_window:
|
||||
mov eax, [rsp+96] ; /* eax = write */
|
||||
cmp ecx, eax
|
||||
jbe L_contiguous_in_window ; /* if (write >= nbytes) */
|
||||
|
||||
mov esi, [rsp+92] ; /* from = wsize */
|
||||
add rsi, [rsp+56] ; /* from += window */
|
||||
add rsi, rax ; /* from += write */
|
||||
sub rsi, rcx ; /* from -= nbytes */
|
||||
sub ecx, eax ; /* nbytes -= write */
|
||||
|
||||
mov eax, r14d ; /* eax = len */
|
||||
cmp eax, ecx
|
||||
jbe L_do_copy ; /* if (nbytes >= len) */
|
||||
|
||||
sub eax, ecx ; /* len -= nbytes */
|
||||
rep movsb
|
||||
mov rsi, [rsp+56] ; /* from = window */
|
||||
mov ecx, [rsp+96] ; /* nbytes = write */
|
||||
cmp eax, ecx
|
||||
jbe L_do_copy ; /* if (nbytes >= len) */
|
||||
|
||||
sub eax, ecx ; /* len -= nbytes */
|
||||
rep movsb
|
||||
mov rsi, rdi
|
||||
sub rsi, r15 ; /* from = out - dist */
|
||||
jmp L_do_copy
|
||||
|
||||
ALIGN 4
|
||||
L_contiguous_in_window:
|
||||
mov rsi, [rsp+56] ; /* rsi = window */
|
||||
add rsi, rax
|
||||
sub rsi, rcx ; /* from += write - nbytes */
|
||||
|
||||
mov eax, r14d ; /* eax = len */
|
||||
cmp eax, ecx
|
||||
jbe L_do_copy ; /* if (nbytes >= len) */
|
||||
|
||||
sub eax, ecx ; /* len -= nbytes */
|
||||
rep movsb
|
||||
mov rsi, rdi
|
||||
sub rsi, r15 ; /* from = out - dist */
|
||||
jmp L_do_copy ; /* if (nbytes >= len) */
|
||||
|
||||
ALIGN 4
|
||||
L_do_copy:
|
||||
mov ecx, eax ; /* ecx = len */
|
||||
rep movsb
|
||||
|
||||
mov rsi, r8 ; /* move in back to %esi, toss from */
|
||||
jmp L_while_test
|
||||
|
||||
L_test_for_end_of_block:
|
||||
test al, 32
|
||||
jz L_invalid_literal_length_code
|
||||
mov dword ptr [rsp+116], 1
|
||||
jmp L_break_loop_with_status
|
||||
|
||||
L_invalid_literal_length_code:
|
||||
mov dword ptr [rsp+116], 2
|
||||
jmp L_break_loop_with_status
|
||||
|
||||
L_invalid_distance_code:
|
||||
mov dword ptr [rsp+116], 3
|
||||
jmp L_break_loop_with_status
|
||||
|
||||
L_invalid_distance_too_far:
|
||||
mov dword ptr [rsp+116], 4
|
||||
jmp L_break_loop_with_status
|
||||
|
||||
L_break_loop:
|
||||
mov dword ptr [rsp+116], 0
|
||||
|
||||
L_break_loop_with_status:
|
||||
; /* put in, out, bits, and hold back into ar and pop esp */
|
||||
mov [rsp+16], rsi ; /* in */
|
||||
mov [rsp+32], rdi ; /* out */
|
||||
mov [rsp+88], ebx ; /* bits */
|
||||
mov [rsp+80], rdx ; /* hold */
|
||||
|
||||
mov rax, [rsp] ; /* restore rbp and rsp */
|
||||
mov rbp, [rsp+8]
|
||||
mov rsp, rax
|
||||
|
||||
|
||||
|
||||
mov rsi,[rsp-8]
|
||||
mov rdi,[rsp-16]
|
||||
mov r12,[rsp-24]
|
||||
mov r13,[rsp-32]
|
||||
mov r14,[rsp-40]
|
||||
mov r15,[rsp-48]
|
||||
mov rbx,[rsp-56]
|
||||
|
||||
ret 0
|
||||
; :
|
||||
; : "m" (ar)
|
||||
; : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
|
||||
; "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
|
||||
; );
|
||||
|
||||
inffas8664fnc ENDP
|
||||
;_TEXT ENDS
|
||||
END
|
|
@ -1,31 +0,0 @@
|
|||
Summary
|
||||
-------
|
||||
This directory contains ASM implementations of the functions
|
||||
longest_match() and inflate_fast(), for 64 bits x86 (both AMD64 and Intel EM64t),
|
||||
for use with Microsoft Macro Assembler (x64) for AMD64 and Microsoft C++ 64 bits.
|
||||
|
||||
gvmat64.asm was written by Gilles Vollant (2005), based on Brian Raiter's 686/32-bit
|
||||
assembly-optimized version of Jean-loup Gailly's original longest_match function
|
||||
|
||||
inffasx64.asm and inffas8664.c were written by Chris Anderson, by optimizing
|
||||
original function from Mark Adler
|
||||
|
||||
Use instructions
|
||||
----------------
|
||||
Assemble the .asm files using MASM and put the object files into the zlib source
|
||||
directory. You can also get object files here:
|
||||
|
||||
http://www.winimage.com/zLibDll/zlib124_masm_obj.zip
|
||||
|
||||
Define ASMV and ASMINF in your project. Include inffas8664.c in your source tree,
|
||||
and add inffasx64.obj and gvmat64.obj as objects to link.
|
||||
|
||||
|
||||
Build instructions
|
||||
------------------
|
||||
run bld_64.bat with Microsoft Macro Assembler (x64) for AMD64 (ml64.exe)
|
||||
|
||||
ml64.exe is given with Visual Studio 2005, Windows 2003 server DDK
|
||||
|
||||
You can get the Windows 2003 Server DDK, which includes ml64 and cl for AMD64, from
|
||||
http://www.microsoft.com/whdc/devtools/ddk/default.mspx (for a low price)
|
|
@ -1,2 +0,0 @@
|
|||
ml /coff /Zi /c /Flmatch686.lst match686.asm
|
||||
ml /coff /Zi /c /Flinffas32.lst inffas32.asm
|
File diff suppressed because it is too large
Load Diff
|
@ -1,479 +0,0 @@
|
|||
; match686.asm -- Asm portion of the optimized longest_match for 32 bits x86
|
||||
; Copyright (C) 1995-1996 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
|
||||
; File written by Gilles Vollant, by converting match686.S from Brian Raiter
|
||||
; for MASM. This is an assembly version of longest_match
|
||||
; from Jean-loup Gailly in deflate.c
|
||||
;
|
||||
; http://www.zlib.net
|
||||
; http://www.winimage.com/zLibDll
|
||||
; http://www.muppetlabs.com/~breadbox/software/assembly.html
|
||||
;
|
||||
; For Visual C++ 4.x and higher and ML 6.x and higher
|
||||
; ml.exe is distributed in
|
||||
; http://www.microsoft.com/downloads/details.aspx?FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64
|
||||
;
|
||||
; this file contain two implementation of longest_match
|
||||
;
|
||||
; this longest_match was written by Brian raiter (1998), optimized for Pentium Pro
|
||||
; (and the faster known version of match_init on modern Core 2 Duo and AMD Phenom)
|
||||
;
|
||||
; for using an assembly version of longest_match, you need define ASMV in project
|
||||
;
|
||||
; compile the asm file running
|
||||
; ml /coff /Zi /c /Flmatch686.lst match686.asm
|
||||
; and include match686.obj in your project
|
||||
;
|
||||
; note: contrib of zLib 1.2.3 and earlier contained both a deprecated version for
|
||||
; Pentium (prior Pentium Pro) and this version for Pentium Pro and modern processor
|
||||
; with autoselect (with cpu detection code)
|
||||
; if you want to support the old Pentium optimization, you can still use that version
|
||||
;
|
||||
; this file is not optimized for old pentium, but it is compatible with all x86 32 bits
|
||||
; processor (starting 80386)
|
||||
;
|
||||
;
|
||||
; see below : zlib1222add must be adjusted if you use a zlib version < 1.2.2.2
|
||||
|
||||
;uInt longest_match(s, cur_match)
|
||||
; deflate_state *s;
|
||||
; IPos cur_match; /* current match */
|
||||
|
||||
; Symbolic names for esp-relative stack slots, each written as an offset
; from NbStack (76).
; NOTE(review): apart from `window` and `scan`, none of these names is used by
; the longest_match code in this file, and `window` / `scan` are redefined by
; the ";;; stack frame offsets" equates further down. This group looks like a
; leftover from an earlier implementation -- confirm before relying on it.
NbStack equ 76
|
||||
cur_match equ dword ptr[esp+NbStack-0]
|
||||
str_s equ dword ptr[esp+NbStack-4]
|
||||
; 5 dword on top (ret,ebp,esi,edi,ebx)
|
||||
adrret equ dword ptr[esp+NbStack-8]
|
||||
pushebp equ dword ptr[esp+NbStack-12]
|
||||
pushedi equ dword ptr[esp+NbStack-16]
|
||||
pushesi equ dword ptr[esp+NbStack-20]
|
||||
pushebx equ dword ptr[esp+NbStack-24]
|
||||
|
||||
; local variable slots below the five dwords listed above
chain_length equ dword ptr [esp+NbStack-28]
|
||||
limit equ dword ptr [esp+NbStack-32]
|
||||
best_len equ dword ptr [esp+NbStack-36]
|
||||
window equ dword ptr [esp+NbStack-40]
|
||||
prev equ dword ptr [esp+NbStack-44]
|
||||
scan_start equ word ptr [esp+NbStack-48]
|
||||
wmask equ dword ptr [esp+NbStack-52]
|
||||
match_start_ptr equ dword ptr [esp+NbStack-56]
|
||||
nice_match equ dword ptr [esp+NbStack-60]
|
||||
scan equ dword ptr [esp+NbStack-64]
|
||||
|
||||
windowlen equ dword ptr [esp+NbStack-68]
|
||||
match_start equ dword ptr [esp+NbStack-72]
|
||||
strend equ dword ptr [esp+NbStack-76]
|
||||
; NbStack minus the 24 bytes covered by the five "on top" dwords and str_s
NbStackAdd equ (NbStack-24)
|
||||
|
||||
; Assembler setup: 386 instruction set (privileged forms enabled), flat model.
.386p
|
||||
|
||||
name gvmatch
|
||||
.MODEL FLAT
|
||||
|
||||
|
||||
|
||||
; all the +zlib1222add offsets are due to the addition of fields
|
||||
; in zlib in the deflate_state structure since the asm code was first written
|
||||
; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
|
||||
; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
|
||||
; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
|
||||
|
||||
; Constant added to every deflate_state field offset defined in this file.
zlib1222add equ 8
|
||||
|
||||
; Note : these value are good with a 8 bytes boundary pack structure
; The dep_* offsets are the same numbers, written in hex, as the ds* offsets
; defined further down (e.g. 74h = 116 = dsMaxChainLen). The longest_match
; code in this file uses the ds* names only.
|
||||
dep_chain_length equ 74h+zlib1222add
|
||||
dep_window equ 30h+zlib1222add
|
||||
dep_strstart equ 64h+zlib1222add
|
||||
dep_prev_length equ 70h+zlib1222add
|
||||
dep_nice_match equ 88h+zlib1222add
|
||||
dep_w_size equ 24h+zlib1222add
|
||||
dep_prev equ 38h+zlib1222add
|
||||
dep_w_mask equ 2ch+zlib1222add
|
||||
dep_good_match equ 84h+zlib1222add
|
||||
dep_match_start equ 68h+zlib1222add
|
||||
dep_lookahead equ 6ch+zlib1222add
|
||||
|
||||
|
||||
; Code segment; closed by "_TEXT ends" at the end of the file.
_TEXT segment
|
||||
|
||||
; Export the two entry points, with or without a leading underscore
; depending on whether NOUNDERLINE is defined.
IFDEF NOUNDERLINE
|
||||
public longest_match
|
||||
public match_init
|
||||
ELSE
|
||||
public _longest_match
|
||||
public _match_init
|
||||
ENDIF
|
||||
|
||||
; Match-length limits, and the lookahead margin derived from them.
MAX_MATCH equ 258
|
||||
MIN_MATCH equ 3
|
||||
MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
|
||||
|
||||
|
||||
|
||||
; The same three constants are defined a second time with identical values.
MAX_MATCH equ 258
|
||||
MIN_MATCH equ 3
|
||||
MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1)
|
||||
; NOTE(review): (258 + 7) AND 0FFF0h evaluates to 256, whereas the compare
; loop in longest_match hard-codes 0108h (264) and 0fffffef8h (-264) for
; "MAX_MATCH_8". This symbol is not referenced by the code in this file;
; a mask of 0FFF8h would give 264 -- confirm the intended value.
MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h)
|
||||
|
||||
|
||||
;;; stack frame offsets
;;; These are the esp-relative slots used by longest_match after its
;;; prologue (four pushes followed by "sub esp, LocalVarsSize").
|
||||
|
||||
chainlenwmask equ esp + 0 ; high word: current chain len
|
||||
; low word: s->wmask
|
||||
window equ esp + 4 ; local copy of s->window
|
||||
windowbestlen equ esp + 8 ; s->window + bestlen
|
||||
scanstart equ esp + 16 ; first two bytes of string
|
||||
scanend equ esp + 12 ; last two bytes of string
|
||||
scanalign equ esp + 20 ; dword-misalignment of string
|
||||
nicematch equ esp + 24 ; a good enough match size
|
||||
bestlen equ esp + 28 ; size of best match so far
|
||||
scan equ esp + 32 ; ptr to string wanting match
|
||||
|
||||
; Size in bytes of the local-variable area above (nine dword slots).
LocalVarsSize equ 36
|
||||
; The prologue pushes ebp, edi, esi, ebx in that order, so above the locals:
; saved ebx is at esp + 36
|
||||
; saved esi is at esp + 40
|
||||
; saved edi is at esp + 44
|
||||
; saved ebp is at esp + 48
|
||||
; return address is at esp + 52
|
||||
deflatestate equ esp + 56 ; the function arguments
|
||||
curmatch equ esp + 60
|
||||
|
||||
;;; Offsets for fields in the deflate_state structure. These numbers
|
||||
;;; are calculated from the definition of deflate_state, with the
|
||||
;;; assumption that the compiler will dword-align the fields. (Thus,
|
||||
;;; changing the definition of deflate_state could easily cause this
|
||||
;;; program to crash horribly, without so much as a warning at
|
||||
;;; compile time. Sigh.)
;;; Every offset is shifted by zlib1222add, defined earlier in this file.
;;; dsMatchLen and dsPrevMatch are defined here but not referenced by the
;;; longest_match code in this file.
|
||||
|
||||
dsWSize equ 36+zlib1222add
|
||||
dsWMask equ 44+zlib1222add
|
||||
dsWindow equ 48+zlib1222add
|
||||
dsPrev equ 56+zlib1222add
|
||||
dsMatchLen equ 88+zlib1222add
|
||||
dsPrevMatch equ 92+zlib1222add
|
||||
dsStrStart equ 100+zlib1222add
|
||||
dsMatchStart equ 104+zlib1222add
|
||||
dsLookahead equ 108+zlib1222add
|
||||
dsPrevLen equ 112+zlib1222add
|
||||
dsMaxChainLen equ 116+zlib1222add
|
||||
dsGoodMatch equ 132+zlib1222add
|
||||
dsNiceMatch equ 136+zlib1222add
|
||||
|
||||
|
||||
;;; match686.asm -- Pentium-Pro-optimized version of longest_match()
|
||||
;;; Written for zlib 1.1.2
|
||||
;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
|
||||
;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html
|
||||
;;;
|
||||
;;
|
||||
;; This software is provided 'as-is', without any express or implied
|
||||
;; warranty. In no event will the authors be held liable for any damages
|
||||
;; arising from the use of this software.
|
||||
;;
|
||||
;; Permission is granted to anyone to use this software for any purpose,
|
||||
;; including commercial applications, and to alter it and redistribute it
|
||||
;; freely, subject to the following restrictions:
|
||||
;;
|
||||
;; 1. The origin of this software must not be misrepresented; you must not
|
||||
;; claim that you wrote the original software. If you use this software
|
||||
;; in a product, an acknowledgment in the product documentation would be
|
||||
;; appreciated but is not required.
|
||||
;; 2. Altered source versions must be plainly marked as such, and must not be
|
||||
;; misrepresented as being the original software
|
||||
;; 3. This notice may not be removed or altered from any source distribution.
|
||||
;;
|
||||
|
||||
;GLOBAL _longest_match, _match_init
|
||||
|
||||
|
||||
;SECTION .text
|
||||
|
||||
;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
|
||||
|
||||
;_longest_match:
|
||||
;;; longest_match / _longest_match (name selected by NOUNDERLINE)
;;;
;;; Walks the chain of candidate match positions starting at curmatch and
;;; looks for the longest string matching the one at window + strstart.
;;;
;;; In:   two stack arguments, read after the prologue as [deflatestate]
;;;       (pointer to the deflate_state) and [curmatch].
;;; Out:  eax = the smaller of the best length found and s->lookahead.
;;;       The match_start field (dsMatchStart) is written whenever a longer
;;;       match is accepted.
;;; Regs: ebp, edi, esi and ebx are saved on entry and restored before ret;
;;;       eax, ecx and edx are overwritten.
IFDEF NOUNDERLINE
|
||||
longest_match proc near
|
||||
ELSE
|
||||
_longest_match proc near
|
||||
ENDIF
|
||||
.FPO (9, 4, 0, 0, 1, 0)
|
||||
|
||||
;;; Save registers that the compiler may be using, and adjust esp to
|
||||
;;; make room for our stack frame.
|
||||
|
||||
push ebp
|
||||
push edi
|
||||
push esi
|
||||
push ebx
|
||||
sub esp, LocalVarsSize
|
||||
|
||||
;;; Retrieve the function arguments. ecx will hold cur_match
|
||||
;;; throughout the entire function. edx will hold the pointer to the
|
||||
;;; deflate_state structure during the function's setup (before
|
||||
;;; entering the main loop.
|
||||
|
||||
mov edx, [deflatestate]
|
||||
mov ecx, [curmatch]
|
||||
|
||||
;;; uInt wmask = s->w_mask;
|
||||
;;; unsigned chain_length = s->max_chain_length;
|
||||
;;; if (s->prev_length >= s->good_match) {
|
||||
;;; chain_length >>= 2;
|
||||
;;; }
|
||||
|
||||
mov eax, [edx + dsPrevLen]
|
||||
mov ebx, [edx + dsGoodMatch]
|
||||
cmp eax, ebx
|
||||
;;; The two loads below do not change the flags, so the jl still tests
;;; the prev_length / good_match comparison made above.
mov eax, [edx + dsWMask]
|
||||
mov ebx, [edx + dsMaxChainLen]
|
||||
jl LastMatchGood
|
||||
shr ebx, 2
|
||||
LastMatchGood:
|
||||
|
||||
;;; chainlen is decremented once beforehand so that the function can
|
||||
;;; use the sign flag instead of the zero flag for the exit test.
|
||||
;;; It is then shifted into the high word, to make room for the wmask
|
||||
;;; value, which it will always accompany.
|
||||
|
||||
dec ebx
|
||||
shl ebx, 16
|
||||
or ebx, eax
|
||||
mov [chainlenwmask], ebx
|
||||
|
||||
;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
;;; NOTE(review): the C line above casts to uInt, but jl is a signed
;;; condition; the two agree only while both values are below 2^31.
|
||||
|
||||
mov eax, [edx + dsNiceMatch]
|
||||
mov ebx, [edx + dsLookahead]
|
||||
cmp ebx, eax
|
||||
jl LookaheadLess
|
||||
mov ebx, eax
|
||||
LookaheadLess: mov [nicematch], ebx
|
||||
|
||||
;;; register Bytef *scan = s->window + s->strstart;
|
||||
|
||||
mov esi, [edx + dsWindow]
|
||||
mov [window], esi
|
||||
mov ebp, [edx + dsStrStart]
|
||||
lea edi, [esi + ebp]
|
||||
mov [scan], edi
|
||||
|
||||
;;; Determine how many bytes the scan ptr is off from being
|
||||
;;; dword-aligned.
;;; (-scan) AND 3 is the number of bytes up to the next dword boundary.
|
||||
|
||||
mov eax, edi
|
||||
neg eax
|
||||
and eax, 3
|
||||
mov [scanalign], eax
|
||||
|
||||
;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
|
||||
;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
;;; Computed here as strstart - (w_size - MIN_LOOKAHEAD), replaced by 0
;;; when the result is not positive.
|
||||
|
||||
mov eax, [edx + dsWSize]
|
||||
sub eax, MIN_LOOKAHEAD
|
||||
sub ebp, eax
|
||||
jg LimitPositive
|
||||
xor ebp, ebp
|
||||
LimitPositive:
|
||||
|
||||
;;; int best_len = s->prev_length;
|
||||
|
||||
mov eax, [edx + dsPrevLen]
|
||||
mov [bestlen], eax
|
||||
|
||||
;;; Store the sum of s->window + best_len in windowbestlen, and keep it in esi.
|
||||
|
||||
add esi, eax
|
||||
mov [windowbestlen], esi
|
||||
|
||||
;;; register ush scan_start = *(ushf*)scan;
|
||||
;;; register ush scan_end = *(ushf*)(scan+best_len-1);
|
||||
;;; Posf *prev = s->prev;
|
||||
|
||||
movzx ebx, word ptr [edi]
|
||||
mov [scanstart], ebx
|
||||
movzx ebx, word ptr [edi + eax - 1]
|
||||
mov [scanend], ebx
|
||||
mov edi, [edx + dsPrev]
|
||||
|
||||
;;; Jump into the main loop.
|
||||
|
||||
mov edx, [chainlenwmask]
|
||||
jmp short LoopEntry
|
||||
|
||||
align 4
|
||||
|
||||
;;; do {
|
||||
;;; match = s->window + cur_match;
|
||||
;;; if (*(ushf*)(match+best_len-1) != scan_end ||
|
||||
;;; *(ushf*)match != scan_start) continue;
|
||||
;;; [...]
|
||||
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
|
||||
;;; && --chain_length != 0);
|
||||
;;;
|
||||
;;; Here is the inner loop of the function. The function will spend the
|
||||
;;; majority of its time in this loop, and majority of that time will
|
||||
;;; be spent in the first ten instructions.
|
||||
;;;
|
||||
;;; Within this loop:
|
||||
;;; ebx = scanend
|
||||
;;; ecx = curmatch
|
||||
;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
|
||||
;;; esi = windowbestlen - i.e., (window + bestlen)
|
||||
;;; edi = prev
|
||||
;;; ebp = limit
|
||||
|
||||
LookupLoop:
|
||||
and ecx, edx
|
||||
movzx ecx, word ptr [edi + ecx*2]
|
||||
cmp ecx, ebp
|
||||
jbe LeaveNow
|
||||
;;; Decrement the chain counter held in the high word of edx; leave once
;;; it goes negative.
sub edx, 00010000h
|
||||
js LeaveNow
|
||||
LoopEntry: movzx eax, word ptr [esi + ecx - 1]
|
||||
cmp eax, ebx
|
||||
jnz LookupLoop
|
||||
mov eax, [window]
|
||||
movzx eax, word ptr [eax + ecx]
|
||||
cmp eax, [scanstart]
|
||||
jnz LookupLoop
|
||||
|
||||
;;; Store the current value of chainlen.
|
||||
|
||||
mov [chainlenwmask], edx
|
||||
|
||||
;;; Point edi to the string under scrutiny, and esi to the string we
|
||||
;;; are hoping to match it up with. In actuality, esi and edi are
|
||||
;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and edx is
|
||||
;;; initialized to -(MAX_MATCH_8), with MAX_MATCH_8 hard-coded as 0108h.
|
||||
|
||||
mov esi, [window]
|
||||
mov edi, [scan]
|
||||
add esi, ecx
|
||||
mov eax, [scanalign]
|
||||
mov edx, 0fffffef8h; -(MAX_MATCH_8)
|
||||
lea edi, [edi + eax + 0108h] ;MAX_MATCH_8]
|
||||
lea esi, [esi + eax + 0108h] ;MAX_MATCH_8]
|
||||
|
||||
;;; Test the strings for equality, 8 bytes at a time. At the end,
|
||||
;;; adjust edx so that it is offset to the exact byte that mismatched.
|
||||
;;;
|
||||
;;; We already know at this point that the first three bytes of the
|
||||
;;; strings match each other, and they can be safely passed over before
|
||||
;;; starting the compare loop. So what this code does is skip over 0-3
|
||||
;;; bytes, as much as necessary in order to dword-align the edi
|
||||
;;; pointer. (esi will still be misaligned three times out of four.)
|
||||
;;;
|
||||
;;; It should be confessed that this loop usually does not represent
|
||||
;;; much of the total running time. Replacing it with a more
|
||||
;;; straightforward "rep cmpsb" would not drastically degrade
|
||||
;;; performance.
|
||||
|
||||
LoopCmps:
|
||||
mov eax, [esi + edx]
|
||||
xor eax, [edi + edx]
|
||||
jnz LeaveLoopCmps
|
||||
mov eax, [esi + edx + 4]
|
||||
xor eax, [edi + edx + 4]
|
||||
jnz LeaveLoopCmps4
|
||||
add edx, 8
|
||||
jnz LoopCmps
|
||||
jmp short LenMaximum
|
||||
;;; eax holds the XOR of the two dwords that differ. Locate the first
;;; differing byte: skip two bytes if the low word is zero, then one more
;;; byte if the low byte is zero ("sub al, 1" sets carry only when al was 0,
;;; and "adc edx, 0" adds that carry).
LeaveLoopCmps4: add edx, 4
|
||||
LeaveLoopCmps: test eax, 0000FFFFh
|
||||
jnz LenLower
|
||||
add edx, 2
|
||||
shr eax, 16
|
||||
LenLower: sub al, 1
|
||||
adc edx, 0
|
||||
|
||||
;;; Calculate the length of the match. If it is longer than MAX_MATCH,
|
||||
;;; then automatically accept it as the best possible match and leave.
;;; (edi + edx is the address of the first mismatching byte in the scan
;;; string; subtracting scan gives the match length in eax.)
|
||||
|
||||
lea eax, [edi + edx]
|
||||
mov edi, [scan]
|
||||
sub eax, edi
|
||||
cmp eax, MAX_MATCH
|
||||
jge LenMaximum
|
||||
|
||||
;;; If the length of the match is not longer than the best match we
|
||||
;;; have so far, then forget it and return to the lookup loop.
;;; The loop registers (esi, edi, ebx, edx) are reloaded first.
|
||||
|
||||
mov edx, [deflatestate]
|
||||
mov ebx, [bestlen]
|
||||
cmp eax, ebx
|
||||
jg LongerMatch
|
||||
mov esi, [windowbestlen]
|
||||
mov edi, [edx + dsPrev]
|
||||
mov ebx, [scanend]
|
||||
mov edx, [chainlenwmask]
|
||||
jmp LookupLoop
|
||||
|
||||
;;; s->match_start = cur_match;
|
||||
;;; best_len = len;
|
||||
;;; if (len >= nice_match) break;
|
||||
;;; scan_end = *(ushf*)(scan+best_len-1);
;;; (edi still holds scan here, so [edi + eax - 1] reads the new scan_end.)
|
||||
|
||||
LongerMatch: mov ebx, [nicematch]
|
||||
mov [bestlen], eax
|
||||
mov [edx + dsMatchStart], ecx
|
||||
cmp eax, ebx
|
||||
jge LeaveNow
|
||||
mov esi, [window]
|
||||
add esi, eax
|
||||
mov [windowbestlen], esi
|
||||
movzx ebx, word ptr [edi + eax - 1]
|
||||
mov edi, [edx + dsPrev]
|
||||
mov [scanend], ebx
|
||||
mov edx, [chainlenwmask]
|
||||
jmp LookupLoop
|
||||
|
||||
;;; Accept the current string, with the maximum possible length.
|
||||
|
||||
LenMaximum: mov edx, [deflatestate]
|
||||
mov dword ptr [bestlen], MAX_MATCH
|
||||
mov [edx + dsMatchStart], ecx
|
||||
|
||||
;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
|
||||
;;; return s->lookahead;
|
||||
|
||||
LeaveNow:
|
||||
mov edx, [deflatestate]
|
||||
mov ebx, [bestlen]
|
||||
mov eax, [edx + dsLookahead]
|
||||
cmp ebx, eax
|
||||
jg LookaheadRet
|
||||
mov eax, ebx
|
||||
LookaheadRet:
|
||||
|
||||
;;; Restore the stack and return from whence we came.
|
||||
|
||||
add esp, LocalVarsSize
|
||||
pop ebx
|
||||
pop esi
|
||||
pop edi
|
||||
pop ebp
|
||||
|
||||
ret
|
||||
; please don't remove this string !
|
||||
; You can freely use match686 in any free or commercial app if you don't remove the string in the binary!
; (The bytes below are data placed after the ret; they are never executed.)
|
||||
db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah
|
||||
|
||||
|
||||
IFDEF NOUNDERLINE
|
||||
longest_match endp
|
||||
ELSE
|
||||
_longest_match endp
|
||||
ENDIF
|
||||
|
||||
; match_init / _match_init (name selected by NOUNDERLINE)
; Does nothing: the procedure body is a single ret.
IFDEF NOUNDERLINE
|
||||
match_init proc near
|
||||
ret
|
||||
match_init endp
|
||||
ELSE
|
||||
_match_init proc near
|
||||
ret
|
||||
_match_init endp
|
||||
ENDIF
|
||||
|
||||
|
||||
_TEXT ends
|
||||
end
|
|
@ -1,27 +0,0 @@
|
|||
|
||||
Summary
|
||||
-------
|
||||
This directory contains ASM implementations of the functions
|
||||
longest_match() and inflate_fast().
|
||||
|
||||
|
||||
Use instructions
|
||||
----------------
|
||||
Assemble using MASM, and copy the object files into the zlib source
|
||||
directory, then run the appropriate makefile, as suggested below. You can
|
||||
download MASM from here:
|
||||
|
||||
http://www.microsoft.com/downloads/details.aspx?displaylang=en&FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64
|
||||
|
||||
You can also get object files here:
|
||||
|
||||
http://www.winimage.com/zLibDll/zlib124_masm_obj.zip
|
||||
|
||||
Build instructions
|
||||
------------------
|
||||
* With Microsoft C and MASM:
|
||||
nmake -f win32/Makefile.msc LOC="-DASMV -DASMINF" OBJA="match686.obj inffas32.obj"
|
||||
|
||||
* With Borland C and TASM:
|
||||
make -f win32/Makefile.bor LOCAL_ZLIB="-DASMV -DASMINF" OBJA="match686.obj inffas32.obj" OBJPA="+match686c.obj+match686.obj+inffas32.obj"
|
||||
|
|
@ -3,7 +3,6 @@
|
|||
#
|
||||
# Usage:
|
||||
# make -f win32/Makefile.bor
|
||||
# make -f win32/Makefile.bor LOCAL_ZLIB=-DASMV OBJA=match.obj OBJPA=+match.obj
|
||||
|
||||
# ------------ Borland C++ ------------
|
||||
|
||||
|
|
|
@ -11,10 +11,6 @@
|
|||
#
|
||||
# make -fwin32/Makefile.gcc; make test testdll -fwin32/Makefile.gcc
|
||||
#
|
||||
# To use the asm code, type:
|
||||
# cp contrib/asm?86/match.S ./match.S
|
||||
# make LOC=-DASMV OBJA=match.o -fwin32/Makefile.gcc
|
||||
#
|
||||
# To install libz.a, zconf.h and zlib.h in the system directories, type:
|
||||
#
|
||||
# make install -fwin32/Makefile.gcc
|
||||
|
@ -38,7 +34,6 @@ IMPLIB = libz.dll.a
|
|||
#
|
||||
SHARED_MODE=0
|
||||
|
||||
#LOC = -DASMV
|
||||
#LOC = -DZLIB_DEBUG -g
|
||||
|
||||
PREFIX =
|
||||
|
|
|
@ -4,10 +4,6 @@
|
|||
# Usage:
|
||||
# nmake -f win32/Makefile.msc (standard build)
|
||||
# nmake -f win32/Makefile.msc LOC=-DFOO (nonstandard build)
|
||||
# nmake -f win32/Makefile.msc LOC="-DASMV -DASMINF" \
|
||||
# OBJA="inffas32.obj match686.obj" (use ASM code, x86)
|
||||
# nmake -f win32/Makefile.msc AS=ml64 LOC="-DASMV -DASMINF -I." \
|
||||
# OBJA="inffasx64.obj gvmat64.obj inffas8664.obj" (use ASM code, x64)
|
||||
|
||||
# The toplevel directory of the source tree.
|
||||
#
|
||||
|
|
Loading…
Reference in New Issue