Added compress-4.0

git-svn-id: file:///srv/svn/repos/haiku/trunk/current@7815 a95241bf-73f2-0310-859d-f6bbb57e9c96
2004-06-07 21:23:10 +00:00 · 2004-06-07 21:23:10 +00:00 · 8bcb00aa88
commit 8bcb00aa88
parent 2defe47a24
5 changed files with 2412 additions and 0 deletions
--- a/src/apps/bin/compress/Jamfile
+++ b/src/apps/bin/compress/Jamfile
@ -0,0 +1,7 @@
+SubDir OBOS_TOP src apps bin compress ;
+# USERMEM/SACREDMEM: see README; USERMEM-SACREDMEM bounds the maximum BITS.
+SubDirCcFlags -DSACREDMEM=256000 -D_FPOS_T -DUSERMEM=2097152 -DBEOS=1 ;
+# BinCommand (project rule): target "compress", single source compress.c.
+BinCommand compress : 
+	compress.c
+	;
--- a/src/apps/bin/compress/README
+++ b/src/apps/bin/compress/README
@ -0,0 +1,283 @@
+
+	@(#)README 1.1 86/09/25 SMI; from UCB 5.3 85/09/17
+
+Compress version 4.0 improvements over 3.0:
+	o compress() speedup (10-50%) by changing division hash to xor
+	o decompress() speedup (5-10%)
+	o Memory requirements reduced (3-30%)
+	o Stack requirements reduced to less than 4kb
+	o Removed 'Big+Fast' compress code (FBITS) because of compress speedup
+    	o Portability mods for Z8000 and PC/XT (but not zeus 3.2)
+	o Default to 'quiet' mode
+	o Unification of 'force' flags
+	o Manual page overhaul
+	o Portability enhancement for M_XENIX
+	o Removed text on #else and #endif
+	o Added "-V" switch to print version and options
+	o Added #defines for SIGNED_COMPARE_SLOW
+	o Added Makefile and "usermem" program
+	o Removed all floating point computations
+	o New programs: [deleted]
+
+The "usermem" script attempts to determine the maximum process size.  Some
+editing of the script may be necessary (see the comments).  [It should work
+fine on 4.3 bsd.] If you can't get it to work at all, just create file
+"USERMEM" containing the maximum process size in decimal.
+
+The following preprocessor symbols control the compilation of "compress.c":
+
+	o USERMEM		Maximum process memory on the system
+	o SACREDMEM		Amount to reserve for other processes
+	o SIGNED_COMPARE_SLOW	Unsigned compare instructions are faster
+	o NO_UCHAR		Don't use "unsigned char" types
+	o BITS			Overrules default set by USERMEM-SACREDMEM
+	o vax			Generate inline assembler
+	o interdata		Defines SIGNED_COMPARE_SLOW
+	o M_XENIX		Makes arrays < 65536 bytes each
+	o pdp11			BITS=12, NO_UCHAR
+	o z8000			BITS=12
+	o pcxt			BITS=12
+	o BSD4_2		Allow long filenames ( > 14 characters) &
+				Call setlinebuf(stderr)
+
+The difference "usermem-sacredmem" determines the maximum BITS that can be
+specified with the "-b" flag.
+
+memory: at least		BITS
+------  -- -----                ----
+     433,484			 16
+     229,600			 15
+     127,536			 14
+      73,464			 13
+           0			 12
+
+The default is BITS=16.
+
+The maximum bits can be overruled by specifying "-DBITS=bits" at
+compilation time.
+
+WARNING: files compressed on a large machine with more bits than allowed by 
+a version of compress on a smaller machine cannot be decompressed!  Use the
+"-b12" flag to generate a file on a large machine that can be uncompressed 
+on a 16-bit machine.
+
+The output of compress 4.0 is fully compatible with that of compress 3.0.
+In other words, the output of compress 4.0 may be fed into uncompress 3.0 or
+the output of compress 3.0 may be fed into uncompress 4.0.
+
+The output of compress 4.0 is not compatible with that of
+compress 2.0.  However, compress 4.0 still accepts the output of
+compress 2.0.  To generate output that is compatible with compress
+2.0, use the undocumented "-C" flag.
+
+	-from mod.sources, submitted by vax135!petsd!joe (Joe Orost), 8/1/85
+--------------------------------
+
+Enclosed is compress version 3.0 with the following changes:
+
+1.	"Block" compression is performed.  After the BITS run out, the
+	compression ratio is checked every so often.  If it is decreasing,
+	the table is cleared and a new set of substrings are generated.
+
+	This makes the output of compress 3.0 not compatible with that of
+	compress 2.0.  However, compress 3.0 still accepts the output of
+	compress 2.0.  To generate output that is compatible with compress
+	2.0, use the undocumented "-C" flag.
+
+2.	A quiet "-q" flag has been added for use by the news system.
+
+3.	The character chaining has been deleted and the program now uses
+	hashing.  This improves the speed of the program, especially
+	during decompression.  Other speed improvements have been made,
+	such as using putc() instead of fwrite().
+
+4.	A large table is used on large machines when a relatively small
+	number of bits is specified.  This saves much time when compressing
+	for a 16-bit machine on a 32-bit virtual machine.  Note that the
+	speed improvement only occurs when the input file is > 30000
+	characters, and the -b BITS is less than or equal to the cutoff
+	described below.
+
+Most of these changes were made by James A. Woods (ames!jaw).  Thank you
+James!
+
+To compile compress:
+
+	cc -O -DUSERMEM=usermem -o compress compress.c
+
+Where "usermem" is the amount of physical user memory available (in bytes).  
+If any physical memory is to be reserved for other processes, put in 
+"-DSACREDMEM=sacredmem", where "sacredmem" is the amount to be reserved.
+
+The difference "usermem-sacredmem" determines the maximum BITS that can be
+specified, and the cutoff bits where the large+fast table is used.
+
+memory: at least		BITS		cutoff
+------  -- -----                ----            ------
+   4,718,592 			 16		  13
+   2,621,440 			 16		  12
+   1,572,864			 16		  11
+   1,048,576			 16		  10
+     631,808			 16               --
+     329,728			 15               --
+     178,176			 14		  --
+      99,328			 13		  --
+           0			 12		  --
+
+The default memory size is 750,000 which gives a maximum BITS=16 and no
+large+fast table.
+
+The maximum bits can be overruled by specifying "-DBITS=bits" at
+compilation time.
+
+If your machine doesn't support unsigned characters, define "NO_UCHAR" 
+when compiling.
+
+If your machine has "int" as 16-bits, define "SHORT_INT" when compiling.
+
+After compilation, move "compress" to a standard executable location, such 
+as /usr/local.  Then:
+	cd /usr/local
+	ln compress uncompress
+	ln compress zcat
+
+On machines that have a fixed stack size (such as Perkin-Elmer), set the
+stack to at least 12kb.  ("setstack compress 12" on Perkin-Elmer).
+
+Next, install the manual (compress.l).
+	cp compress.l /usr/man/manl
+	cd /usr/man/manl
+	ln compress.l uncompress.l
+	ln compress.l zcat.l
+
+		- or -
+
+	cp compress.l /usr/man/man1/compress.1
+	cd /usr/man/man1
+	ln compress.1 uncompress.1
+	ln compress.1 zcat.1
+
+					regards,
+					petsd!joe
+
+Here is a note from the net:
+
+>From hplabs!pesnta!amd!turtlevax!ken Sat Jan  5 03:35:20 1985
+Path: ames!hplabs!pesnta!amd!turtlevax!ken
+From: ken@turtlevax.UUCP (Ken Turkowski)
+Newsgroups: net.sources
+Subject: Re: Compress release 3.0 : sample Makefile
+Organization: CADLINC, Inc. @ Menlo Park, CA
+
+In the compress 3.0 source recently posted to mod.sources, there is a
+#define variable which can be set for optimum performance on a machine
+with a large amount of memory.  A program (usermem) to calculate the
+useable amount of physical user memory is enclosed, as well as a sample
+4.2bsd Vax Makefile for compress.
+
+Here is the README file from the previous version of compress (2.0):
+
+>Enclosed is compress.c version 2.0 with the following bugs fixed:
+>
+>1.	The packed files produced by compress are different on different
+>	machines and dependent on the vax sysgen option.
+>		The bug was in the different byte/bit ordering on the
+>		various machines.  This has been fixed.
+>
+>		This version is NOT compatible with the original vax posting
+>		unless the '-DCOMPATIBLE' option is specified to the C
+>		compiler.  The original posting has a bug which I fixed, 
+>		causing incompatible files.  I recommend you NOT to use this
+>		option unless you already have a lot of packed files from
+>		the original posting by thomas.
+>2.	The exit status is not well defined (on some machines) causing the
+>	scripts to fail.
+>		The exit status is now 0,1 or 2 and is documented in
+>		compress.l.
+>3.	The function getopt() is not available in all C libraries.
+>		The function getopt() is no longer referenced by the
+>		program.
+>4.	Error status is not being checked on the fwrite() and fflush() calls.
+>		Fixed.
+>
+>The following enhancements have been made:
+>
+>1.	Added facilities of "compact" into the compress program.  "Pack",
+>	"Unpack", and "Pcat" are no longer required (no longer supplied).
+>2.	Installed work around for C compiler bug with "-O".
+>3.	Added a magic number header (\037\235).  Put the bits specified
+>	in the file.
+>4.	Added "-f" flag to force overwrite of output file.
+>5.	Added "-c" flag and "zcat" program.  'ln compress zcat' after you
+>	compile.
+>6.	The 'uncompress' script has been deleted; simply 
+>	'ln compress uncompress' after you compile and it will work.
+>7.	Removed extra bit masking for machines that support unsigned
+>	characters.  If your machine doesn't support unsigned characters,
+>	define "NO_UCHAR" when compiling.
+>
+>Compile "compress.c" with "-O -o compress" flags.  Move "compress" to a
+>standard executable location, such as /usr/local.  Then:
+>	cd /usr/local
+>	ln compress uncompress
+>	ln compress zcat
+>
+>On machines that have a fixed stack size (such as Perkin-Elmer), set the
+>stack to at least 12kb.  ("setstack compress 12" on Perkin-Elmer).
+>
+>Next, install the manual (compress.l).
+>	cp compress.l /usr/man/manl		- or -
+>	cp compress.l /usr/man/man1/compress.1
+>
+>Here is the README that I sent with my first posting:
+>
+>>Enclosed is a modified version of compress.c, along with scripts to make it
+>>run identically to pack(1), unpack(1), and pcat(1).  Here is what I
+>>(petsd!joe) and a colleague (petsd!peora!srd) did:
+>>
+>>1. Removed VAX dependencies.
+>>2. Changed the struct to separate arrays; saves mucho memory.
+>>3. Did comparisons in unsigned, where possible.  (Faster on Perkin-Elmer.)
+>>4. Sorted the character next chain and changed the search to stop
+>>prematurely.  This saves a lot on the execution time when compressing.
+>>
+>>This version is totally compatible with the original version.  Even though
+>>lint(1) -p has no complaints about compress.c, it won't run on a 16-bit
+>>machine, due to the size of the arrays.
+>>
+>>Here is the README file from the original author:
+>> 
+>>>Well, with all this discussion about file compression (for news batching
+>>>in particular) going around, I decided to implement the text compression
+>>>algorithm described in the June Computer magazine.  The author claimed
+>>>blinding speed and good compression ratios.  It's certainly faster than
+>>>compact (but, then, what wouldn't be), but it's also the same speed as
+>>>pack, and gets better compression than both of them.  On 350K bytes of
+>>>unix-wizards, compact took about 8 minutes of CPU, pack took about 80
+>>>seconds, and compress (herein) also took 80 seconds.  But, compact and
+>>>pack got about 30% compression, whereas compress got over 50%.  So, I
+>>>decided I had something, and that others might be interested, too.
+>>>
+>>>As is probably true of compact and pack (although I haven't checked),
+>>>the byte order within a word is probably relevant here, but as long as
+>>>you stay on a single machine type, you should be ok.  (Can anybody
+>>>elucidate on this?)  There are a couple of asm's in the code (extv and
+>>>insv instructions), so anyone porting it to another machine will have to
+>>>deal with this anyway (and could probably make it compatible with Vax
+>>>byte order at the same time).  Anyway, I've linted the code (both with
+>>>and without -p), so it should run elsewhere.  Note the longs in the
+>>>code, you can take these out if you reduce BITS to <= 15.
+>>>
+>>>Have fun, and as always, if you make good enhancements, or bug fixes,
+>>>I'd like to see them.
+>>>
+>>>=Spencer (thomas@utah-20, {harpo,hplabs,arizona}!utah-cs!thomas)
+>>
+>>					regards,
+>>					joe
+>>
+>>--
+>>Full-Name:  Joseph M. Orost
+>>UUCP:       ..!{decvax,ucbvax,ihnp4}!vax135!petsd!joe
+>>US Mail:    MS 313; Perkin-Elmer; 106 Apple St; Tinton Falls, NJ 07724
+>>Phone:      (201) 870-5844
--- a/src/apps/bin/compress/compress.1
+++ b/src/apps/bin/compress/compress.1
@ -0,0 +1,264 @@
+.PU
+.TH COMPRESS 1 local
+.SH NAME
+compress, uncompress, zcat  \-  compress and uncompress files
+.SH SYNOPSIS
+.ll +8
+.B compress
+[
+.B \-c
+] [
+.B \-C
+] [
+.B \-d
+] [
+.B \-f
+] [
+.B \-v
+] [
+.B \-b
+.I bits
+] [
+.I "filename \&..."
+]
+.ll -8
+.br
+.B uncompress
+[
+.B \-c
+] [
+.B \-f
+] [
+.B \-v
+] [
+.B \-V
+] [
+.I "filename \&..."
+]
+.br
+.B zcat
+[
+.I "filename \&..."
+]
+.SH DESCRIPTION
+Compresses the specified files or standard input.
+Each file is replaced by a file with the extension
+.B "\&.Z,"
+but only if the file got smaller.
+If no files are specified,
+the compression is applied to the standard input
+and is written to standard output
+regardless of the results.
+Compressed files can be restored
+to their original form by specifying the
+.B \-d
+option, or by running
+.I uncompress
+(linked to
+.IR compress ),
+on the 
+.B "\&.Z"
+files or the standard input.
+.PP
+If the output file exists, it will not be overwritten unless the
+.B \-f
+flag is given.  If
+.B \-f
+is not specified and
+.I compress
+is run in the foreground,
+the user is prompted
+as to whether the file should be overwritten.
+.PP
+If the
+.B \-f
+flag is given, all files specified are replaced with
+.B "\&.Z"
+files \- even if the file didn't get smaller.
+.PP
+When file names are given, the ownership (if run by root), modes, accessed
+and modified times are maintained between the file and its 
+.B "\&.Z"
+version.  In this respect,
+.I compress
+can be used for archival purposes, yet can still be used with
+.IR make "(1)"
+after uncompression.
+.PP
+The
+.B \-c
+option causes the results of the compress/uncompress operation to be written
+to stdout; no files are changed.  The
+.I zcat
+program is the same as specifying
+.B \-c
+to
+.I uncompress
+(all files are unpacked and written to stdout).
+.PP
+.I Compress
+uses the modified Lempel-Ziv algorithm described in
+"A Technique for High Performance Data Compression",
+Terry A. Welch,
+.I "IEEE Computer"
+Vol 17, No 6 (June 1984), pp 8-19.
+Common substrings in the file are first replaced by 9-bit codes 257 and up.
+When code 512 is reached, the algorithm switches to 10-bit codes and
+continues to use more bits until the
+.I bits
+limit as specified by the
+.B \-b
+flag is reached (default 16).
+.I Bits
+must be between 9 and 16.  The default can be changed in the source to allow
+.I compress
+to be run on a smaller machine.
+.PP
+After the
+.I bits
+limit is reached,
+.I compress
+periodically checks the compression ratio.  If it is increasing,
+.I compress
+continues to use the codes that were previously found in the file.  However,
+if the compression ratio decreases,
+.I compress
+discards the table of substrings and rebuilds it from scratch.  This allows
+the algorithm to adapt to the next "block" of the file.  The
+.B \-C
+(compatibility) flag prevents subdivision of the file into blocks;
+this produces an output file that old versions of 
+.I compress
+can read.
+.PP
+A two byte magic number is prepended to the file
+to ensure that neither uncompression of random text nor recompression of 
+compressed text are attempted.  In addition, the
+.I bits
+specified during
+.I compress
+is written to the file so that the
+.B \-b
+flag can be omitted for
+.IR uncompress \.
+.PP
+.ne 8
+The amount of compression obtained depends on the size of the
+input file, the amount of
+.I bits
+per code, and the distribution of character substrings.
+Typically, text files, such as C programs,
+are reduced by 50\-60%.
+Compression is generally much better than that achieved by
+Huffman coding (as used in
+.IR pack ),
+or adaptive Huffman coding
+.RI ( compact ),
+and takes less time to compute.
+.PP
+.PP
+If the
+.B \-v
+(verbose) flag is given, then
+after each file is compressed, a message is printed giving the percentage of
+the input file that has been saved by compression.
+.PP
+If the
+.B \-V
+(version) flag is given, the program's version number is printed.
+.PP
+The exit status is normally 0;
+if the last file gets bigger after compression, the exit status is 2;
+if an error occurs, the exit status is 1.
+.SH "SEE ALSO"
+compact(1), pack(1)
+.SH "DIAGNOSTICS"
+Usage: compress [-cCdfvV] [-b maxbits] [file ...]
+.in +8
+Invalid options were specified on the command line.
+.in -8
+Missing maxbits
+.in +8
+Maxbits must follow
+.BR \-b \.
+.in -8
+Unknown flag: 
+.I "\'x\';"
+.in +8
+Invalid flags were specified on the command line.
+.in -8
+.IR file :
+not in compressed format
+.in +8
+The specified file has not been compressed.
+.in -8
+.IR file :
+compressed with 
+.I xx
+bits, can only handle 
+.I yy
+bits
+.in +8
+The specified file was compressed by a compress program that could handle
+more 
+.I bits
+than the current compress program.  Recompress the file with a smaller
+.IR bits \.
+.in -8
+.IR file :
+already has .Z suffix -- no change
+.in +8
+Cannot compress a file that has a ".Z" suffix.
+.IR mv "(1)"
+the file to a different name and try again.
+.in -8
+.IR file :
+filename too long to tack on .Z
+.in +8
+The specified file cannot be compressed because its filename is longer than
+12 characters.
+.IR mv "(1)"
+the file to a different name and try again.  This message does not occur on
+4.2BSD systems.
+.in -8
+.I file
+already exists; do you wish to overwrite (y or n)?
+.in +8
+Respond "y" if you want the output file to be replaced; "n" if you want it
+to be left alone.
+.in -8
+.IR file :
+.in +8
+This message fragment is written during the processing of a file.
+.in -8
+Compression: 
+.I "xx.xx%"
+.in +8
+This message fragment gives the percentage of the input file that has been
+saved by compression.
+.in -8
+-- not a regular file: unchanged
+.in +8
+This message fragment is written when the input file is not a regular file.
+The input file is left unchanged.
+.in -8
+-- has 
+.I xx 
+other links: unchanged
+.in +8
+This message fragment is written when the input file has links.  The input
+file is left unchanged.  See
+.IR ln "(1)"
+for more information.
+.in -8
+-- file unchanged
+.in +8
+This message fragment is written when no savings are achieved by
+compression.  The input file is left unchanged.
+.in -8
+-- replaced with 
+.I file
+.in +8
+This message fragment is written when a file has been successfully
+compressed/uncompressed.
+.in -8
--- a/src/apps/bin/compress/compress.c
+++ b/src/apps/bin/compress/compress.c
--- a/src/apps/bin/compress/usermem.sh
+++ b/src/apps/bin/compress/usermem.sh
@ -0,0 +1,83 @@
+#! /bin/sh
+#
+#	@(#)usermem.sh 1.1 86/09/25 SMI; from UCB 5.4 85/09/17
+#
+: This shell script snoops around to find the maximum amount of available
+: user memory.  These variables need to be set only if there is no
+: /usr/adm/messages.  KMEM, UNIX, and CLICKSIZE can be set on the command
+: line, if desired, e.g. UNIX=/unix
+KMEM=/dev/kmem		# User needs read access to KMEM
+UNIX=			# Kernel image; probed for below when left empty
+# VAX			CLICKSIZE=512,	UNIX=/vmunix
+# PDP-11		CLICKSIZE=64,	UNIX=/unix
+# CADLINC 68000		CLICKSIZE=4096,	UNIX=/unix
+# Perkin-Elmer 3205	CLICKSIZE=4096,	UNIX=/edition7
+# Perkin-Elmer all others, CLICKSIZE=2048, UNIX=/edition7
+CLICKSIZE=512		# Multiplier applied to the value adb reads below
+eval "$@"		# NAME=value overrides; arguments run as shell code
+
+if test -n "$UNIX"		# Choose the kernel image and its click size
+then
+    : User must have specified it already.
+elif test -r /vmunix
+then
+    UNIX=/vmunix
+    if [ -r /bin/sun2 ] && /bin/sun2
+    then
+	CLICKSIZE=2048	# Sun-2
+    elif [ -r /bin/sun3 ] && /bin/sun3
+    then
+	CLICKSIZE=8192	# Sun-3
+    else
+	CLICKSIZE=512	# Probably VAX
+    fi
+elif test -r /edition7
+then
+    UNIX=/edition7
+    CLICKSIZE=2048	# Perkin-Elmer: change to 4096 on a 3205
+elif test -r /unix
+then
+    UNIX=/unix		# Could be anything
+fi
+
+SIZE=0			# Stays 0 or becomes empty when no source yields a value
+# messages: probably the most transportable (last field of last "avail" line)
+if test -r /usr/adm/messages && test -s /usr/adm/messages
+then
+    SIZE=`grep avail /usr/adm/messages | sed -n '$s/.*[ 	]//p'`
+fi
+
+if test "0$SIZE" -le 0		# no SIZE in /usr/adm/messages
+then
+    if test -r "$KMEM"		# Readable KMEM
+    then
+	if test -n "$UNIX"	# adb needs a kernel image for the symbols
+	then
+	    SIZE=`echo maxmem/D | adb "$UNIX" "$KMEM" | sed -n '$s/.*[ 	]//p'`
+	    if test "0$SIZE" -le 0	# maxmem gave nothing: try physmem
+	    then
+		SIZE=`echo physmem/D | adb "$UNIX" "$KMEM" | sed -n '$s/.*[ 	]//p'`
+	    fi
+	    SIZE=`expr "0$SIZE" '*' "$CLICKSIZE"`	# scale by click size
+	fi
+    fi
+fi
+
+case $UNIX in
+    /vmunix)		# Assume 4.2bsd: check for resource limits
+	MAXSIZE=`csh -c limit | awk 'BEGIN	{ MAXSIZE = 1000000 }
+/datasize|memoryuse/ && NF == 3	{ if ($2 < MAXSIZE) MAXSIZE = $2 }
+END	{ print MAXSIZE * 1000 }'`
+	if test "$MAXSIZE" -lt "0$SIZE"	# 0-prefix: valid even if SIZE is empty
+	then
+	    SIZE=$MAXSIZE
+	fi
+	;;
+esac
+
+if test "0$SIZE" -le 0		# Nothing usable found: print 0 and fail
+then
+    echo 0; exit 1
+else
+    echo "$SIZE"
+fi