diff --git a/usr.bin/sort/Makefile b/usr.bin/sort/Makefile new file mode 100644 index 000000000000..76abdebb7e2d --- /dev/null +++ b/usr.bin/sort/Makefile @@ -0,0 +1,6 @@ +# @(#)Makefile 8.1 (Berkeley) 6/6/93 + +PROG= sort +SRCS= append.c fields.c files.c fsort.c init.c msort.c sort.c tmp.c + +.include diff --git a/usr.bin/sort/TEST/stests b/usr.bin/sort/TEST/stests new file mode 100644 index 000000000000..40ea4bf21f00 --- /dev/null +++ b/usr.bin/sort/TEST/stests @@ -0,0 +1,893 @@ +# @(#)stests 8.1 (Berkeley) 6/6/93 + +#Latest version. My sort passes all tests because I wrote it. +#We differ only on 25E and 25H. +#(I found at least one bug in constructing test 25, and was driven +#to rewrite field parsing to clarify it.) +# +#In 25E, -k2.3,2.1b, the fields are not necessarily out of order. +#Even if they were, it would be legal (11752-3), although certainly +#justification for warning. +# +#On 25H, your answer is as defensible as mine. (Our suggestion +#*1 backs mine.) + + +# Tests for the Unix sort utility +# Test Posix features except for locale. +# Test some nonstandard features if present. + +# Other tests should be made for files too big to fit in memory. + + +# Initialize switches for nonstandard features. +# Use parenthesized settings for supported features. + +o=: # officially obsolescent features: +1 -2, misplaced -o (o=) +g=: # -g numeric sort including e-format numbers (g=) +M=: # -M sort by month names (M=) +s=: # -s stable, do not compare raw bytes on equal keys (s=) +y= # -y user-specified memory size (y=-y10000) + +# Detect what features are supported, assuming bad options cause +# errors. Set switches accordingly. 
+ +echo obsolescent and nonstandard features recognized, if any: +if sort +0 /dev/null; then o= + echo ' +1 -2'; fi +if sort /dev/null -o xx 2>/dev/null; then o= + echo ' displaced -o'; fi +if sort -g /dev/null; then g= + echo ' -g g-format numbers'; fi +if sort -M /dev/null; then M= + echo ' -M months'; fi +if sort -s /dev/null; then s= + echo ' -s stable'; fi +if sort -y10000 /dev/null; then y=-y10000 + echo ' -y space'; fi +if sort -z10000 /dev/null; then + echo ' -z size (not exercised)'; fi +if sort -T. /dev/null; then + echo ' -T tempdir (not exercised)'; fi + + +export TEST # major sequence number of test + +trap "rm -f in in1 out xx -k xsort linecount fields; exit" 0 1 2 13 15 + +# xsort testno options +# Sort file "in" with specified options. +# Compare with file "out" if that is supplied, +# otherwise make plausibility checks on output + +# "sum" must be dumb; insensitive to the +# order of lines within a file. +# System V sum is suitable; sum -5 is the v10 equivalent. + +PATH=.:$PATH +export PATH +cat <<'!' >xsort; chmod +x xsort + + X=$1; shift + + if sort "$@" in >xx && sort -c "$@" xx + then + if test -f out + then + cmp xx out >/dev/null && exit 0 + echo $TEST$X comparison failed + else + test "`cksum -o2 linecount; chmod +x linecount +awk 'END{ if(NR!='$3') print "'$TEST$1' failed" }' $2 +! + +rm -f out + +#--------------------------------------------------------------- +TEST=01; echo $TEST # -c status, checksum + # obsolescent features go together +cat <in +b +a +! +rm -f out -o + +sort -c in 2>/dev/null && echo ${TEST}A failed + +xsort B || echo '"cksum"' is probably unsuitable - see comments + +$o sort +0 in -o in || echo ${TEST}c failed + +#--------------------------------------------------------------- +TEST=02; echo $TEST # output from -c +cat <in +x +y +!
+ +sort -cr in >out 2>xx && echo ${TEST}A failed +test -s out && echo ${TEST}B failed +test -s xx && echo option -c is noisy "(probably legal)" +test -s xx || echo option -c is quiet "(legal, not classical)" + +#--------------------------------------------------------------- +TEST=03; echo $TEST # -n +cat <in +-99.0 +-99.1 +-.0002 +-10 +2 +0010.000000000000000000000000000000000001 +10 +3x +x +! +cat <out +-99.1 +-99.0 +-10 +-.0002 +x +2 +3x +10 +0010.000000000000000000000000000000000001 +! + +xsort "" -n + +#--------------------------------------------------------------- +TEST=04; echo $TEST # -b without fields, piping, -c status return +cat <in + b + a +! +cp in out + +xsort A -b + +cat in | sort | cat >xx +cmp xx out >/dev/null || echo ${TEST}B failed + +sort in | sort -cr 2>/dev/null && echo ${TEST}C failed + +#--------------------------------------------------------------- +TEST=05; echo $TEST # fields, reverse fields, -c status return +cat <in +b b p +a b q +x a +! +cat <out +x a +a b q +b b p +! + +$o xsort A +1 -2 + +$o xsort B +1 -2 +2r + +xsort C -k 2,2 + +xsort D -k 2,2 -k 3r + +xsort E -k 2,2.0 + +xsort F -k 2,2 -k 1,1 -k 3 + +sort -c -k 2 in 2>/dev/null && echo ${TEST}G failed + +#--------------------------------------------------------------- +TEST=06; echo $TEST # -t +cat <in +a: +a! +! +cp in out + +$o xsort A -t : -r +0 + +$o xsort B -t : +0 -1 + +xsort C -t : -r -k 1 + +xsort D -t : -k 1,1 + +#--------------------------------------------------------------- +TEST=07; echo $TEST # -t, character positions in fields + # -t: as 1 arg is not strictly conforming, but classical +cat <in +: ab +:bac +! +cat <out +:bac +: ab +! + +$o xsort A -b -t: +1.1 + +$o xsort B -t: +1.1r + +xsort C -b -t: -k 2.2 + +xsort D -t: -k 2.2r + +#--------------------------------------------------------------- +TEST=08; echo $TEST # space and tab as -t characters +cat <in + b c + b c + b c +!
+cp in out + +xsort A -t ' ' -k2,2 + +xsort B -t ' ' -k2.1,2.0 + +cat <out + b c + b c + b c +! + +xsort C -t ' ' -k2,2 + +xsort D -t ' ' -k2.1,2.0 + +cat <out + b c + b c + b c +! + +xsort E -k2 + +cat <out + b c + b c + b c +! + +xsort F -k2b + +#--------------------------------------------------------------- +TEST=09; echo $TEST # alphabetic as -t character +cat <in +zXa +yXa +zXb +! +cp in out + +xsort "" -tX -k2 -k1r,1 + +#--------------------------------------------------------------- +TEST=10; echo $TEST # -m +cat <in +a +ab +ab +bc +ca +! +cat <in1 +Z +a +aa +ac +c +! +cat <out +Z +a +a +aa +ab +ab +ac +bc +c +ca +! + +sort -m in in1 >xx +cmp xx out >/dev/null || echo $TEST failed + +#--------------------------------------------------------------- +TEST=11; echo $TEST # multiple files, -o overwites input, -m, -mu +cat <in +a +b +c +d +! + +sort -o xx in in in in in in in in in in in in in in in in in +linecount A xx 68 +sort -o in -mu in in in in in in in in in in in in in in in in in +linecount B in 4 +sort -o in -m in in in in in in in in in in in in in in in in in + +cmp in xx >/dev/null || echo ${TEST}C failed + +#--------------------------------------------------------------- +TEST=12; echo $TEST # does -mu pick the first among equals? +cat <in +3B +3b +3B2 +~3B2 +4.1 +41 +5 +5. +! +cat <out +3B +3B2 +4.1 +5 +! 
+ +xsort A -mudf || echo "(other behavior is legal, not classical)" + +xsort B -mudf -k1 || echo "(other behavior is legal, not classical)" + +#--------------------------------------------------------------- +TEST=13; echo $TEST # long records (>8000 bytes, keys >16000), -r +awk ' +BEGIN { x="x" + for(i=1; i<=12; i++) x = x x + for(i=15; i<=25; i++) print x i +}' >in +awk ' +BEGIN { x="x" + for(i=1; i<=12; i++) x = x x + for(i=25; i>=15; i--) print x i +}' >out + +xsort A -r + +xsort B -k 1,1r -k 1 + +#--------------------------------------------------------------- +TEST=14; echo $TEST "(3 long parts)" +awk 'BEGIN { for(i=0; i<100000; i++) print rand() }' | grep -v e >in +rm -f out + +xsort A; echo $TEST "(part A done)" + +xsort B -n; echo $TEST "(part B done)" + +# next test is unclean: xx is a hidden side-effect of xsort + +awk ' + $0 < x { print "test '${TEST}C' failed"; exit } + $0 "" != x { print >"out"; x = $0 } +' xx + +xsort C -n -u + +#--------------------------------------------------------------- +TEST=15; echo $TEST "(long)" # force intermediate files if possible +awk 'BEGIN { for(i=0; i<20000; i++) print rand() }' >in +rm -f out + +xsort A -r $y + +sort -r in | awk '$0 "x" != x { print ; x = $0 "x" }' >out + +xsort B -u -r $y + +#--------------------------------------------------------------- +TEST=16; echo $TEST # -nr, -nm, file name - +awk 'BEGIN { for(i=-100; i<=100; i+=2) printf "%.10d\n", i }' >in + +awk 'BEGIN { for(i=-99; i<=100; i+=2) print i }' | sort -nr in - >xx +awk '$0+0 != 101-NR { print "'${TEST}A' failed"; exit }' xx + +awk 'BEGIN { for(i=-99; i<=100; i+=2) print i }' | sort -mn in - >xx +awk '$0+0 != -101+NR { print "'${TEST}B' failed"; exit }' xx + +#--------------------------------------------------------------- +TEST=17; echo $TEST # -d, fields without end, modifier override +cat <in +a-B +a+b +a b +A+b +a b +! +cat <out +a b +a b +A+b +a-B +a+b +! 
+ +$o xsort A -df +0 +0d + +xsort B -df -k 1 -k 1d + +#--------------------------------------------------------------- +TEST=18; echo $TEST # -u on key only +cat <in +12 y +13 z +12 x +! +cat <out +12 x +12 y +13 z +! + +$o xsort A +0 -1 + +xsort B -k 1,1 + +sort -u -k 1,1 in >xx +linecount C xx 2 + +#--------------------------------------------------------------- +TEST=19; echo $TEST # -i, -d, -f +cat <xx.c +run(i,j){ for( ; i<=j; i++) printf("%.3o %c\n",i,i); } +main(){ run(0, 011); /* 012=='\n' */ + run(013, 0377); } +! +cc xx.c +a.out >in +cat <xx.c +run(i,j){ for( ; i<=j; i++) printf("%.3o %c\n",i,i); } +main(){ run(0, 011); + run(013, ' '-1); + run(0177, 0377); + run(' ', 0176); } +! +cc xx.c +a.out >out + +xsort A -i -k 2 + +cat <xx.c +run(i,j){ for( ; i<=j; i++) printf("%.3o %c\n",i,i); } +main(){ run(0, 010); /* 011=='\t', 012=='\n' */ + run(013, ' '-1); + run(' '+1, '0'-1); + run('9'+1, 'A'-1); + run('Z'+1, 'a'-1); + run('z'+1, 0377); + run('\t', '\t'); + run(' ', ' '); + run('0', '9'); + run('A', 'Z'); + run('a', 'z'); } +! +cc xx.c +a.out >out + +xsort B -d -k 2 + +cat <xx.c +run(i,j){ for( ; i<=j; i++) printf("%.3o %c\n",i,i); } +main(){ int i; + run(0, 011); + run(013, 'A'-1); + for(i='A'; i<='Z'; i++) + printf("%.3o %c\n%.3o %c\n",i,i,i+040,i+040); + run('Z'+1, 'a'-1); + run('z'+1, 0377); } +! +cc xx.c +a.out >out +rm xx.c + +xsort C -f -k 2 + +#--------------------------------------------------------------- +TEST=20; echo $TEST # -d, -f, -b applies only to fields +cat <in + b +'C +a +! +cp in out + +xsort A -d + +xsort B -f + +cat <out + b +a +'C +! + +xsort C -dfb + +#--------------------------------------------------------------- +TEST=21; echo $TEST # behavior of null bytes +cat <<'!' >xx.c +main() { printf("%cb\n%ca\n",0,0); } +! +cc xx.c +a.out >in +sort in >xx +cmp in xx >/dev/null && echo ${TEST}A failed +test "`wc -c in +a 2 +a 1 +b 2 +b 1 +! +cat <out +b 1 +b 2 +a 1 +a 2 +! 
+ +xsort "" -r -k1,1 -k2n + +#--------------------------------------------------------------- +TEST=23; echo $TEST # empty file + +sort -o xx /dev/null || echo ${TEST}A failed + +sort -c in +0:2:3:4:5:6:7:8:9 +1:1:3:4:5:6:7:8:9 +1:2:2:4:5:6:7:8:9 +1:2:3:3:5:6:7:8:9 +1:2:3:4:4:6:7:8:9 +1:2:3:4:5:5:7:8:9 +1:2:3:4:5:6:6:8:9 +1:2:3:4:5:6:7:7:9 +1:2:3:4:5:6:7:8:8 +! +cat <out +1:2:3:4:5:6:7:8:8 +1:2:3:4:5:6:7:7:9 +1:2:3:4:5:6:6:8:9 +1:2:3:4:5:5:7:8:9 +1:2:3:4:4:6:7:8:9 +1:2:3:3:5:6:7:8:9 +1:2:2:4:5:6:7:8:9 +1:1:3:4:5:6:7:8:9 +0:2:3:4:5:6:7:8:9 +! + +xsort "" -t: -k9 -k8 -k7 -k6 -k5 -k4 -k3 -k2 -k1 + +#--------------------------------------------------------------- +TEST=25; echo $TEST # variously specified alpha fields + # numbers give the correct orderings +cat <in +01:04:19:01:16:01:21:01 a +02:03:13:15:13:19:15:02 a +03:02:07:09:07:13:09:03 a +04:01:01:03:01:07:03:04 a +05:08:20:16:17:02:20:05 aa +06:07:14:18:14:20:14:06 aa +07:06:08:10:08:14:08:07 aa +08:05:02:04:02:08:02:08 aa +09:16:22:02:22:04:24:13 b +10:15:16:20:19:22:18:14 b +11:14:10:12:10:16:12:15 b +12:13:04:06:04:10:06:16 b +13:24:24:22:24:06:22:21 bb +14:23:18:24:21:24:16:22 bb +15:22:12:14:12:18:10:23 bb +16:21:06:08:06:12:04:24 bb +17:12:21:21:18:03:19:09 ab +18:11:15:19:15:21:13:10 ab +19:10:09:11:09:15:07:11 ab +20:09:03:05:03:09:01:12 ab +21:20:23:17:23:05:23:17 ba +22:19:17:23:20:23:17:18 ba +23:18:11:13:11:17:11:19 ba +24:17:05:07:05:11:05:20 ba +! 
+sort -k2b -k2 in >xx && + sort -c -t: -k2n xx 2>/dev/null || echo ${TEST}A failed +sort -k2,2.1b -k2 in >xx && + sort -c -t: -k3n xx 2>/dev/null || echo ${TEST}B failed +sort -k2.3 -k2 in >xx && + sort -c -t: -k4n xx 2>/dev/null || echo ${TEST}C failed +sort -k2b,2.3 -k2 in >xx && + sort -c -t: -k5n xx 2>/dev/null || echo ${TEST}D failed +sort -k2.3,2.1b -k2 in >xx && + sort -c -t: -k6n xx 2>/dev/null || echo ${TEST}E failed +sort -k2,2.1b -k2r in >xx && + sort -c -t: -k7n xx 2>/dev/null || echo ${TEST}F failed +sort -b -k2,2 -k2 in >xx && + sort -c -t: -k8n xx 2>/dev/null || echo ${TEST}G failed +sort -b -k2,2b -k2 in >xx && # perhaps same as G + sort -c -t: -k3n xx 2>/dev/null || echo ${TEST}H failed\ + "(standard is not clear on this)" + +#--------------------------------------------------------------- +TEST=26; echo $TEST # empty fields, out of bounds fields +cat <in +0 5 +1 4 +2 3 +3 2 +4 1 +5 0 +! +cp in out + +xsort "" -k2.2,2.1 -k2.3,2.4 + +#--------------------------------------------------------------- +TEST=27; echo $TEST # displaced -o +rm -f out + +$o sort /dev/null -o out || $o echo ${TEST}B failed +$o test -f out || $o echo ${TEST}C failed + +#--------------------------------------------------------------- +TEST=28; echo $TEST # apparently nonmonotone field specs +cat <in +aaaa c +x a +0 b +! 
+cp in out + +$o xsort A +1 -0.3 +1.4 -1.5 + +xsort B -k2,1.3 -k2.5,2.5 + +#--------------------------------------------------------------- +TEST=29; echo $TEST # determination of end of option list +cat >-k <xx || echo ${TEST}A argument failed +cmp xx -k || echo ${TEST}A comparison failed + +sort - -c /dev/null && echo ${TEST}B failed + +#--------------------------------------------------------------- +TEST=30; echo $TEST # missing newline +awk 'BEGIN{ printf "%s", "x"}' | sort >xx +wc -c in +jan 10 1900 +Feb 26 1900 +feb 25 1900 +January xx 1900 +August 11 1900 +jan 15 1990 +feb 22 1990 +mar 15 1990 +apr 1 1990 +may 45 1990 +jun 14 1990 +jul 4 1990 +aug 1~ 1990 +aug 11 1990 +sep 1 1990 +oct 12 1990 +nov 24 1990 +dec 25 1990 +never 3 1990 + Dec 25 1990 +! +cat <out +January xx 1900 +jan 10 1900 +feb 25 1900 +Feb 26 1900 +August 11 1900 +never 3 1990 +jan 15 1990 +feb 22 1990 +mar 15 1990 +apr 1 1990 +may 45 1990 +jun 14 1990 +jul 4 1990 +aug 1~ 1990 +aug 11 1990 +sep 1 1990 +oct 12 1990 +nov 24 1990 + Dec 25 1990 +dec 25 1990 +! + +$M xsort "" -k3n -k1M -k2n + +#--------------------------------------------------------------- +TEST=32; echo $TEST # -M case insensitivity, -r +cat <in +x +june +january +december +! +cat <out +december +june +january +x +! + +$M xsort "" -Mr + +#--------------------------------------------------------------- +TEST=33; echo $TEST # -g +cat <in +2 +1 +10 +.2 +1e +1E1 +1e. +! +cat <out +.2 +1 +1e +1e. +2 +10 +1E1 +! + +$g xsort "" -g + +#--------------------------------------------------------------- +TEST=34; echo $TEST # -g wide operands +cat <in +.99999999999999999999 +099999999999999999999e-21 +099999999999999999999e-19 +.1e1 +! +cat <out +099999999999999999999e-21 +.99999999999999999999 +.1e1 +099999999999999999999e-19 +! + +$g xsort A -g + +cat <out +.1e1 +.99999999999999999999 +099999999999999999999e-19 +099999999999999999999e-21 +! 
+ +xsort B -n + +#--------------------------------------------------------------- +TEST=35; echo $TEST #-g, -u with different fp reps +cat <in ++0 +-0 +0.10 ++.1 +-.1 +-100e-3 +x +! +cat <out +-.1 +-100e-3 ++0 +-0 +x ++.1 +0.10 +! + +$g xsort A -g + +$g sort -gu in >xx && $g sort -c -gu xx || echo ${TEST}B failed +$g linecount C xx 3 + +#--------------------------------------------------------------- +TEST=36; echo $TEST # -s +cat <in +a 2 +b 1 +c 2 +a 1 +b 2 +c 1 +! +cat <out +a 2 +a 1 +b 1 +b 2 +c 2 +c 1 +! + +$s xsort "" -s -k1,1 + +#--------------------------------------------------------------- +TEST=37; echo $TEST # -s, multiple files +cat <in +a 2 +c 2 +! +cat <in1 +a 1 +b 1 +c 1 +! +cat <out +c 2 +b 1 +a 2 +! + +$s sort -smru -k1,1 in in in1 in1 >xx +$s cmp xx out >/dev/null || echo $TEST failed + +#--------------------------------------------------------------- +TEST=38; echo $TEST # -s +$s awk ' + BEGIN { + for(i=1; i<50; i++) + for(j=1; j<=i; j++) { + print i, 2 >"in" + print i, 1 >"in1" + } + }' + +$s sort -m -s -k1,1n in in1 >out + +$s awk ' + func stop() { print "'$TEST' failed"; exit } + $1!=last1 { if(count!=last1 || $2!=2) stop(); + count = 0} + $1==last1 && $2!=last2 { if(count!=last1 || $2!=1) stop(); + count = 0 } + { count++; last1 = $1; last2 = $2 } + ' out diff --git a/usr.bin/sort/append.c b/usr.bin/sort/append.c new file mode 100644 index 000000000000..ee258bbecab0 --- /dev/null +++ b/usr.bin/sort/append.c @@ -0,0 +1,188 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef lint +static char sccsid[] = "@(#)append.c 8.1 (Berkeley) 6/6/93"; +#endif /* not lint */ + +#include "sort.h" + +#include +#include + +#define OUTPUT { \ + if ((n = cpos - ppos) > 1) { \ + for (; ppos < cpos; ++ppos) \ + *ppos -= odepth; \ + ppos -= n; \ + radixsort(ppos, n, wts1, REC_D); \ + for (; ppos < cpos; ppos++) { \ + prec = (RECHEADER *) (*ppos - sizeof(TRECHEADER));\ + put(prec, fd); \ + } \ + } else put(prec, fd); \ +} + +/* + * copy sorted lines to output; check for uniqueness + */ +void +append(keylist, nelem, depth, fd, put, ftbl) + u_char **keylist; + int nelem; + register int depth; + FILE *fd; + void (*put)(RECHEADER *, FILE *); + struct field *ftbl; +{ + register u_char *wts, *wts1; + register n, odepth; + register u_char **cpos, **ppos, **lastkey; + register u_char *cend, *pend, *start; + register struct recheader *crec, *prec; + + if (*keylist == '\0' && UNIQUE) + return; + wts1 = wts = ftbl[0].weights; + if ((!UNIQUE) && SINGL_FLD) { + if (ftbl[0].flags & F && ftbl[0].flags & R) + wts1 = Rascii; + else if (ftbl[0].flags & F) + wts1 = ascii; + odepth = depth; + } + lastkey = keylist + nelem; + depth += sizeof(TRECHEADER); + if (SINGL_FLD && (UNIQUE || wts1 != wts)) { + ppos = keylist; + prec = (RECHEADER *) (*ppos - depth); + if (UNIQUE) + put(prec, fd); + for (cpos = keylist+1; cpos < lastkey; cpos++) { + crec = (RECHEADER *) (*cpos - depth); + if (crec->length == prec->length) { + pend = (u_char *) &prec->offset + prec->length; + cend = (u_char *) &crec->offset + crec->length; + for (start = *cpos; cend >= start; cend--) { + if (wts[*cend] != wts[*pend]) + break; + pend--; + } + if (pend + 1 != *ppos) { + if (!UNIQUE) { + OUTPUT; + } else + put(crec, fd); + ppos = cpos; + prec = crec; + } + } else { + if (!UNIQUE) { + OUTPUT; + } else + put(crec, fd); + ppos = cpos; + prec = crec; + } + } + if (!UNIQUE) { OUTPUT; } + } else if (UNIQUE) { + ppos = keylist; + prec = (RECHEADER *) (*ppos - depth); + put(prec, fd); + for (cpos = 
keylist+1; cpos < lastkey; cpos++) { + crec = (RECHEADER *) (*cpos - depth); + if (crec->offset == prec->offset) { + pend = (u_char *) &prec->offset + prec->offset; + cend = (u_char *) &crec->offset + crec->offset; + for (start = *cpos; cend >= start; cend--) { + if (wts[*cend] != wts[*pend]) + break; + pend--; + } + if (pend + 1 != *ppos) { + ppos = cpos; + prec = crec; + put(prec, fd); + } + } else { + ppos = cpos; + prec = crec; + put(prec, fd); + } + } + } else for (cpos = keylist; cpos < lastkey; cpos++) { + crec = (RECHEADER *) (*cpos - depth); + put(crec, fd); + } +} + +/* + * output the already sorted eol bin. + */ +void +rd_append(binno, infl0, nfiles, outfd, buffer, bufend) + u_char *buffer, *bufend; + int binno, nfiles; + union f_handle infl0; + FILE *outfd; +{ + struct recheader *rec; + rec = (RECHEADER *) buffer; + if (!getnext(binno, infl0, nfiles, (RECHEADER *) buffer, bufend, 0)) { + putline(rec, outfd); + while (getnext(binno, infl0, nfiles, (RECHEADER *) buffer, + bufend, 0) == 0) { + if (!UNIQUE) + putline(rec, outfd); + } + } +} + +/* + * append plain text--used after sorting the biggest bin. + */ +void +concat(a, b) + FILE *a, *b; +{ + int nread; + char buffer[4096]; + + rewind(b); + while ((nread = fread(buffer, 1, 4096, b)) > 0) + EWRITE(buffer, 1, nread, a); +} diff --git a/usr.bin/sort/extern.h b/usr.bin/sort/extern.h new file mode 100644 index 000000000000..25917792df2c --- /dev/null +++ b/usr.bin/sort/extern.h @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)extern.h 8.1 (Berkeley) 6/6/93 + */ + +void append __P((u_char **, int, int, FILE *, void (*)(), struct field *)); +void concat __P((FILE *, FILE *)); +length_t enterkey __P((struct recheader *, + DBT *, int, struct field *)); +void fixit __P((int *, char **)); +void fldreset __P((struct field *)); +FILE *ftmp __P((void)); +void fmerge __P((int, union f_handle, + int, int (*)(), FILE *, void (*)(), struct field *)); +void fsort __P((int, int, union f_handle, int, FILE *, struct field *)); +int geteasy __P((int, union f_handle, + int, struct recheader *, u_char *, struct field *)); +int getnext __P((int, union f_handle, + int, struct recheader *, u_char *, struct field *)); +int makekey __P((int, union f_handle, + int, struct recheader *, u_char *, struct field *)); +int makeline __P((int, union f_handle, + int, struct recheader *, u_char *, struct field *)); +void merge __P((int, int, int (*)(), FILE *, void (*)(), struct field *)); +void num_init __P((void)); +void onepass __P((u_char **, int, long, long *, u_char *, FILE *)); +int optval __P((int, int)); +void order __P((union f_handle, int (*)(), struct field *)); +void putline __P((struct recheader *, FILE *)); +void putrec __P((struct recheader *, FILE *)); +void rd_append __P((int, union f_handle, int, FILE *, u_char *, u_char *)); +int seq __P((FILE *, DBT *, DBT *)); +int setfield __P((char *, struct field *, int)); +void settables __P((int)); diff --git a/usr.bin/sort/fields.c b/usr.bin/sort/fields.c new file mode 100644 index 000000000000..98a8bb9a4e73 --- /dev/null +++ b/usr.bin/sort/fields.c @@ -0,0 +1,319 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef lint +static char sccsid[] = "@(#)fields.c 8.1 (Berkeley) 6/6/93"; +#endif /* not lint */ + +/* Subroutines to generate sort keys. 
*/ + +#include "sort.h" + +#define blancmange(ptr) { \ + if (BLANK & d_mask[*(ptr)]) \ + while (BLANK & d_mask[*(++(ptr))]); \ +} + +#define NEXTCOL(pos) { \ + if (!SEP_FLAG) \ + while (BLANK & l_d_mask[*(++pos)]); \ + while (!((FLD_D | REC_D_F) & l_d_mask[*++pos])); \ +} + +extern u_char *enterfield __P((u_char *, u_char *, struct field *, int)); + +extern u_char *number __P((u_char *, u_char *, u_char *, u_char *, int)); + +extern struct coldesc clist[(ND+1)*2]; +extern int ncols; + +#define DECIMAL '.' +#define OFFSET 128 + +u_char TENS[10]; /* TENS[0] = REC_D <= 128 ? 130 - '0' : 2 -'0'... */ +u_char NEGTENS[10]; /* NEGTENS[0] = REC_D <= 128 ? 126 + '0' : 252 +'0' */ +u_char *OFF_TENS, *OFF_NTENS; /* TENS - '0', NEGTENS - '0' */ +u_char fnum[NBINS], rnum[NBINS]; + +/* + * constructs sort key with leading recheader, followed by the key, + * followed by the original line. + */ +length_t +enterkey(keybuf, line, size, fieldtable) + struct recheader *keybuf; /* pointer to start of key */ + DBT *line; + int size; + struct field fieldtable[]; +{ + int i; + register u_char *l_d_mask; + register u_char *lineend, *pos; + u_char *endkey, *keypos; + register struct coldesc *clpos; + register int col = 1; + struct field *ftpos; + l_d_mask = d_mask; + pos = (u_char *) line->data - 1; + lineend = (u_char *) line->data + line->size-1; + /* don't include rec_delimiter */ + keypos = keybuf->data; + + for (i = 0; i < ncols; i++) { + clpos = clist + i; + for (; (col < clpos->num) && (pos < lineend); col++) + { NEXTCOL(pos); } + if (pos >= lineend) + break; + clpos->start = SEP_FLAG ? 
pos + 1 : pos; + NEXTCOL(pos); + clpos->end = pos; + col++; + if (pos >= lineend) { + clpos->end = lineend; + ++i; + break; + } + } + for (; i <= ncols; i++) + clist[i].start = clist[i].end = lineend; + if (clist[0].start < (u_char *) line->data) + ++clist[0].start; + endkey = (u_char *) keybuf + size - line->size; + for (ftpos = fieldtable + 1; ftpos->icol.num; ftpos++) + if ((keypos = enterfield(keypos, endkey, ftpos, + fieldtable->flags)) == NULL) + return (1); + + if (UNIQUE) + *(keypos-1) = REC_D; + keybuf->offset = keypos - keybuf->data; + keybuf->length = keybuf->offset + line->size; + if (keybuf->length + sizeof(TRECHEADER) > size) + return (1); /* line too long for buffer */ + memcpy(keybuf->data + keybuf->offset, line->data, line->size); + return (0); +} + +/* + * constructs a field (as defined by -k) within a key + */ +u_char * +enterfield(tablepos, endkey, cur_fld, gflags) + struct field *cur_fld; + register u_char *tablepos, *endkey; + int gflags; +{ + register u_char *start, *end, *lineend, *mask, *lweight; + struct column icol, tcol; + register u_int flags; + u_int Rflag; + icol = cur_fld->icol; + tcol = cur_fld->tcol; + flags = cur_fld->flags; + start = icol.p->start; + lineend = clist[ncols].end; + if (flags & BI) + blancmange(start); + start += icol.indent; + start = min(start, lineend); + if (!tcol.num) + end = lineend; + else { + if (tcol.indent) { + end = tcol.p->start; + if (flags & BT) blancmange(end); + end += tcol.indent; + end = min(end, lineend); + } else + end = tcol.p->end; + } + if (flags & N) { + Rflag = (gflags & R ) ^ (flags & R) ? 1 : 0; + tablepos = number(tablepos, endkey, start, end, Rflag); + return (tablepos); + } + mask = alltable; + mask = cur_fld->mask; + lweight = cur_fld->weights; + for (; start < end; start++) + if (mask[*start]) { + if (*start <= 1) { + if (tablepos+2 >= endkey) + return (NULL); + *tablepos++ = lweight[1]; + *tablepos++ = lweight[*start ? 
2 : 1]; + } else { + *tablepos++ = lweight[*start]; + if (tablepos == endkey) + return (NULL); + } + } + *tablepos++ = lweight[0]; + return (tablepos == endkey ? NULL : tablepos); +} + +/* Uses the first bin to assign sign, expsign, 0, and the first + * 61 out of the exponent ( (254 - 3 origins - 4 over/underflows)/4 = 61 ). + * When sorting in forward order: + * use (0-99) -> (130->240) for sorting the mantissa if REC_D <=128; + * else use (0-99)->(2-102). + * If the exponent is >=61, use another byte for each additional 253 + * in the exponent. Cutoff is at 567. + * To avoid confusing the exponent and the mantissa, use a field delimiter + * if the exponent is exactly 61, 61+252, etc--this is ok, since it's the + * only time a field delimiter can come in that position. + * Reverse order is done analagously. +*/ + +u_char * +number(pos, bufend, line, lineend, Rflag) + register u_char *line, *pos, *bufend, *lineend; + int Rflag; +{ + register int or_sign, parity = 0; + register int expincr = 1, exponent = -1; + int bite, expsign = 1, sign = 1; + register u_char lastvalue, *nonzero, *tline, *C_TENS; + u_char *nweights; + + if (Rflag) + nweights = rnum; + else + nweights = fnum; + if (pos > bufend - 8) + return (NULL); + /* or_sign sets the sort direction: + * (-r: +/-)(sign: +/-)(expsign: +/-) */ + or_sign = sign ^ expsign ^ Rflag; + blancmange(line); + if (*line == '-') { /* set the sign */ + or_sign ^= 1; + sign = 0; + line++; + } + /* eat initial zeroes */ + for (; *line == '0' && line < lineend; line++); + /* calculate exponents < 0 */ + if (*line == DECIMAL) { + exponent = 1; + while (*++line == '0' && line < lineend) + exponent++; + expincr = 0; + expsign = 0; + } + /* next character better be a digit */ + if (*line < '1' || *line > '9' || line >= lineend) { + *pos++ = nweights[127]; + return (pos); + } + if (expincr) { + for (tline = line-1; *++tline >= '0' && + *tline <= '9' && tline < lineend;) + exponent++; + } + if (exponent > 567) { + *pos++ = 
nweights[sign ? (expsign ? 254 : 128) + : (expsign ? 0 : 126)]; + warnx("exponent out of bounds"); + return (pos); + } + bite = min(exponent, 61); + *pos++ = nweights[(sign) ? (expsign ? 189+bite : 189-bite) + : (expsign ? 64-bite : 64+bite)]; + if (bite >= 61) { + do { + exponent -= bite; + bite = min(exponent, 254); + *pos++ = nweights[or_sign ? 254-bite : bite]; + } while (bite == 254); + } + C_TENS = or_sign ? OFF_NTENS : OFF_TENS; + for (; line < lineend; line++) { + if (*line >= '0' && *line <= '9') { + if (parity) { + *pos++ = C_TENS[lastvalue] + (or_sign ? - *line + : *line); + if (pos == bufend) + return (NULL); + if (*line != '0' || lastvalue != '0') + nonzero = pos; + } else + lastvalue = *line; + parity ^= 1; + } else if(*line == DECIMAL) { + if(!expincr) /* a decimal already occurred once */ + break; + expincr = 0; + } else + break; + } + if (parity && lastvalue != '0') { + *pos++ = or_sign ? OFF_NTENS[lastvalue] - '0' : + OFF_TENS[lastvalue] + '0'; + } else + pos = nonzero; + if (pos > bufend-1) + return (NULL); + *pos++ = or_sign ? nweights[254] : nweights[0]; + return (pos); +} + +/* This forces a gap around the record delimiter + * Thus fnum has vaues over (0,254) -> ((0,REC_D-1),(REC_D+1,255)); + * rnum over (0,254) -> (255,REC_D+1),(REC_D-1,0)) +*/ +void +num_init() +{ + int i; + TENS[0] = REC_D <=128 ? 130 - '0' : 2 - '0'; + NEGTENS[0] = REC_D <=128 ? 126 + '0' : 254 + '0'; + OFF_TENS = TENS - '0'; + OFF_NTENS = NEGTENS - '0'; + for (i = 1; i < 10; i++) { + TENS[i] = TENS[i-1] + 10; + NEGTENS[i] = NEGTENS[i-1] - 10; + } + for (i = 0; i < REC_D; i++) { + fnum[i] = i; + rnum[255-i] = i; + } + for (i = REC_D; i <255; i++) { + fnum[i] = i+1; + rnum[255-i] = i-1; + } +} diff --git a/usr.bin/sort/files.c b/usr.bin/sort/files.c new file mode 100644 index 000000000000..0d27bc98346c --- /dev/null +++ b/usr.bin/sort/files.c @@ -0,0 +1,338 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef lint +static char sccsid[] = "@(#)files.c 8.1 (Berkeley) 6/6/93"; +#endif /* not lint */ + +#include "sort.h" +#include "fsort.h" + +#include + +/* + * this is the subroutine for file management for fsort(). + * It keeps the buffers for all temporary files. + */ +int +getnext(binno, infl0, nfiles, pos, end, dummy) + int binno, nfiles; + union f_handle infl0; + register struct recheader *pos; + register u_char *end; + struct field *dummy; +{ + register int i; + register u_char *hp; + static long nleft = 0; + static int cnt = 0, flag = -1; + static u_char maxb = 0; + static FILE *fd; + + if (nleft == 0) { + if (binno < 0) /* reset files. */ { + for (i = 0; i < nfiles; i++) { + rewind(fstack[infl0.top + i].fd); + fstack[infl0.top + i].max_o = 0; + } + flag = -1; + nleft = cnt = 0; + return(-1); + } + maxb = fstack[infl0.top].maxb; + for (; nleft == 0; cnt++) { + if (cnt >= nfiles) { + cnt = 0; + return (EOF); + } + fd = fstack[infl0.top + cnt].fd; + hp = (u_char *) &nleft; + for (i = sizeof(TRECHEADER); i; --i) + *hp++ = getc(fd); + if (binno < maxb) + fstack[infl0.top+cnt].max_o + += sizeof(nleft) + nleft; + else if (binno == maxb) { + if (binno != fstack[infl0.top].lastb) { + fseek(fd, fstack[infl0.top+ + cnt].max_o, SEEK_SET); + fread(&nleft, sizeof(nleft), 1, fd); + } + if (nleft == 0) + fclose(fd); + } else if (binno == maxb + 1) { /* skip a bin */ + fseek(fd, nleft, SEEK_CUR); + fread(&nleft, sizeof(nleft), 1, fd); + flag = cnt; + } + } + } + if ((u_char *) pos > end - sizeof(TRECHEADER)) + return (BUFFEND); + hp = (u_char *) pos; + for (i = sizeof(TRECHEADER); i ; --i) + *hp++ = (u_char) getc(fd); + if (end - pos->data < pos->length) { + for (i = sizeof(TRECHEADER); i ; i--) + ungetc(*--hp, fd); + return (BUFFEND); + } + fread(pos->data, pos->length, 1, fd); + nleft -= pos->length + sizeof(TRECHEADER); + if (nleft == 0 && binno == fstack[infl0.top].maxb) + fclose(fd); + return (0); +} + +/* + * this is called when there is no special key. 
It's only called + * in the first fsort pass. + */ +int +makeline(flno, filelist, nfiles, buffer, bufend, dummy2) + int flno, nfiles; + union f_handle filelist; + struct recheader *buffer; + u_char *bufend; + struct field *dummy2; +{ + static char *opos; + register char *end, *pos; + static int fileno = 0, overflow = 0; + static FILE *fd = 0; + register int c; + + pos = (char *) buffer->data; + end = min((char *) bufend, pos + MAXLLEN); + if (overflow) { + memmove(pos, opos, bufend - (u_char *) opos); + pos += ((char *) bufend - opos); + overflow = 0; + } + for (;;) { + if (flno >= 0) { + if (!(fd = fstack[flno].fd)) + return (EOF); + } else if (!fd) { + if (fileno >= nfiles) return(EOF); + if (!(fd = fopen(filelist.names[fileno], "r"))) + err(2, "%s", filelist.names[fileno]); + ++fileno; + } + while ((pos < end) && ((c = getc(fd)) != EOF)) { + if ((*pos++ = c) == REC_D) { + buffer->offset = 0; + buffer->length = pos - (char *) buffer->data; + return (0); + } + } + if (pos >= end && end == (char *) bufend) { + if ((char *) buffer->data < end) { + overflow = 1; + opos = (char *) buffer->data; + } + return (BUFFEND); + } else if (c == EOF) { + if (buffer->data != (u_char *) pos) { + warnx("last character not record delimiter"); + *pos++ = REC_D; + buffer->offset = 0; + buffer->length = pos - (char *) buffer->data; + return(0); + } + FCLOSE(fd); + fd = 0; + if(flno >= 0) fstack[flno].fd = 0; + } else { + buffer->data[100] = '\000'; + warnx("line too long:ignoring %s...", buffer->data); + } + } +} + +/* + * This generates keys. 
It's only called in the first fsort pass + */ +int +makekey(flno, filelist, nfiles, buffer, bufend, ftbl) + int flno, nfiles; + union f_handle filelist; + struct recheader *buffer; + u_char *bufend; + struct field *ftbl; +{ + static int (*get)(); + static int fileno = 0; + static FILE *dbdesc = 0; + static DBT dbkey[1], line[1]; + static int overflow = 0; + int c; + if (overflow) { + overflow = 0; + enterkey(buffer, line, bufend - (u_char *) buffer, ftbl); + return (0); + } + for (;;) { + if (flno >= 0) { + get = seq; + if (!(dbdesc = fstack[flno].fd)) + return(EOF); + } else if (!dbdesc) { + if (fileno >= nfiles) + return (EOF); + dbdesc = fopen(filelist.names[fileno], "r"); + if (!dbdesc) + err(2, "%s", filelist.names[fileno]); + ++fileno; + get = seq; + } + if (!(c = get(dbdesc, line, dbkey))) { + if ((signed)line->size > bufend - buffer->data) + overflow = 1; + else + overflow = enterkey(buffer, line, + bufend - (u_char *) buffer, ftbl); + if (overflow) + return (BUFFEND); + else + return (0); + } + if (c == EOF) { + FCLOSE(dbdesc); + dbdesc = 0; + if (flno >= 0) fstack[flno].fd = 0; + } else { + + ((char *) line->data)[60] = '\000'; + warnx("line too long: ignoring %.100s...", + (char *)line->data); + } + + } +} + +/* + * get a key/line pair from fd + */ +int +seq(fd, line, key) + FILE *fd; + DBT *key, *line; +{ + static char *buf, flag = 1; + register char *end, *pos; + register int c; + if (flag) { + flag = 0; + buf = (char *) linebuf; + end = buf + MAXLLEN; + line->data = buf; + } + pos = buf; + while ((c = getc(fd)) != EOF) { + if ((*pos++ = c) == REC_D) { + line->size = pos - buf; + return (0); + } + if (pos == end) { + line->size = MAXLLEN; + *--pos = REC_D; + while ((c = getc(fd)) != EOF) { + if (c == REC_D) + return (BUFFEND); + } + } + } + if (pos != buf) { + warnx("last character not record delimiter"); + *pos++ = REC_D; + line->size = pos - buf; + return (0); + } else + return (EOF); +} + +/* + * write a key/line pair to a temporary file + */ +void 
+putrec(rec, fd) + register struct recheader *rec; + register FILE *fd; +{ + EWRITE(rec, 1, rec->length + sizeof(TRECHEADER), fd); +} + +/* + * write a line to output + */ +void +putline(rec, fd) + register struct recheader *rec; + register FILE *fd; +{ + EWRITE(rec->data+rec->offset, 1, rec->length - rec->offset, fd); +} + +/* + * get a record from a temporary file. (Used by merge sort.) + */ +int +geteasy(flno, filelist, nfiles, rec, end, dummy2) + int flno, nfiles; + union f_handle filelist; + register struct recheader *rec; + register u_char *end; + struct field *dummy2; +{ + int i; + FILE *fd; + fd = fstack[flno].fd; + if ((u_char *) rec > end - sizeof(TRECHEADER)) + return (BUFFEND); + if (!fread(rec, 1, sizeof(TRECHEADER), fd)) { + fclose(fd); + fstack[flno].fd = 0; + return (EOF); + } + if (end - rec->data < rec->length) { + for (i = sizeof(TRECHEADER) - 1; i >= 0; i--) + ungetc(*((char *) rec + i), fd); + return (BUFFEND); + } + fread(rec->data, rec->length, 1, fd); + return (0); +} diff --git a/usr.bin/sort/fsort.c b/usr.bin/sort/fsort.c new file mode 100644 index 000000000000..1ec948474f60 --- /dev/null +++ b/usr.bin/sort/fsort.c @@ -0,0 +1,286 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef lint +static char sccsid[] = "@(#)fsort.c 8.1 (Berkeley) 6/6/93"; +#endif /* not lint */ + +/* + * Read in the next bin. If it fits in one segment sort it; + * otherwise refine it by segment deeper by one character, + * and try again on smaller bins. Sort the final bin at this level + * of recursion to keep the head of fstack at 0. + * After PANIC passes, abort to merge sort. 
+*/ +#include "sort.h" +#include "fsort.h" + +#include +#include + +u_char **keylist = 0, *buffer = 0, *linebuf = 0; +struct tempfile fstack[MAXFCT]; +extern char *toutpath; +#define FSORTMAX 4 +int PANIC = FSORTMAX; + +void +fsort(binno, depth, infiles, nfiles, outfd, ftbl) + register int binno, depth, nfiles; + register union f_handle infiles; + FILE *outfd; + register struct field *ftbl; +{ + register u_char *bufend, **keypos, *tmpbuf; + u_char *weights; + int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0; + register int c, nelem; + long sizes [NBINS+1]; + union f_handle tfiles, mstart = {MAXFCT-16}; + register int (*get)(int, union f_handle, int, RECHEADER *, + u_char *, struct field *); + register struct recheader *crec; + struct field tfield[2]; + FILE *prevfd, *tailfd[FSORTMAX+1]; + + memset(tailfd, 0, sizeof(tailfd)); + prevfd = outfd; + memset(tfield, 0, sizeof(tfield)); + if (ftbl[0].flags & R) + tfield[0].weights = Rascii; + else + tfield[0].weights = ascii; + tfield[0].icol.num = 1; + weights = ftbl[0].weights; + if (!buffer) { + buffer = malloc(BUFSIZE); + keylist = malloc(MAXNUM * sizeof(u_char *)); + if (!SINGL_FLD) + linebuf = malloc(MAXLLEN); + } + bufend = buffer + BUFSIZE; + if (binno >= 0) { + tfiles.top = infiles.top + nfiles; + get = getnext; + } else { + tfiles.top = 0; + if (SINGL_FLD) + get = makeline; + else + get = makekey; + } + for (;;) { + memset(sizes, 0, sizeof(sizes)); + c = ntfiles = 0; + if (binno == weights[REC_D] && + !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */ + rd_append(weights[REC_D], + infiles, nfiles, prevfd, buffer, bufend); + break; + } else if (binno == weights[REC_D]) { + depth = 0; /* start over on flat weights */ + ftbl = tfield; + weights = ftbl[0].weights; + } + while (c != EOF) { + keypos = keylist; + nelem = 0; + crec = (RECHEADER *) buffer; + while((c = get(binno, infiles, nfiles, crec, bufend, + ftbl)) == 0) { + *keypos++ = crec->data + depth; + if (++nelem == MAXNUM) { + c = BUFFEND; + break; + } + 
crec =(RECHEADER *) ((char *) crec + + SALIGN(crec->length) + sizeof(TRECHEADER)); + } + if (c == BUFFEND || ntfiles || mfct) { /* push */ + if (panic >= PANIC) { + fstack[MAXFCT-16+mfct].fd = ftmp(); + if (radixsort(keylist, nelem, weights, + REC_D)) + err(2, NULL); + append(keylist, nelem, depth, fstack[ + MAXFCT-16+mfct].fd, putrec, ftbl); + mfct++; + /* reduce number of open files */ + if (mfct == 16 ||(c == EOF && ntfiles)) { + tmpbuf = malloc(bufend - + crec->data); + memmove(tmpbuf, crec->data, + bufend - crec->data); + fstack[tfiles.top + ntfiles].fd + = ftmp(); + fmerge(0, mstart, mfct, geteasy, + fstack[tfiles.top+ntfiles].fd, + putrec, ftbl); + ++ntfiles; + mfct = 0; + memmove(crec->data, tmpbuf, + bufend - crec->data); + free(tmpbuf); + } + } else { + fstack[tfiles.top + ntfiles].fd= ftmp(); + onepass(keylist, depth, nelem, sizes, + weights, fstack[tfiles.top+ntfiles].fd); + ++ntfiles; + } + } + } + get = getnext; + if (!ntfiles && !mfct) { /* everything in memory--pop */ + if (nelem > 1) + if (radixsort(keylist, nelem, weights, REC_D)) + err(2, NULL); + append(keylist, nelem, depth, outfd, putline, ftbl); + break; /* pop */ + } + if (panic >= PANIC) { + if (!ntfiles) + fmerge(0, mstart, mfct, geteasy, + outfd, putline, ftbl); + else + fmerge(0, tfiles, ntfiles, geteasy, + outfd, putline, ftbl); + break; + + } + total = maxb = lastb = 0; /* find if one bin dominates */ + for (i = 0; i < NBINS; i++) + if (sizes[i]) { + if (sizes[i] > sizes[maxb]) + maxb = i; + lastb = i; + total += sizes[i]; + } + if (sizes[maxb] < max((total / 2) , BUFSIZE)) + maxb = lastb; /* otherwise pop after last bin */ + fstack[tfiles.top].lastb = lastb; + fstack[tfiles.top].maxb = maxb; + + /* start refining next level. 
*/ + get(-1, tfiles, ntfiles, crec, bufend, 0); /* rewind */ + for (i = 0; i < maxb; i++) { + if (!sizes[i]) /* bin empty; step ahead file offset */ + get(i, tfiles, ntfiles, crec, bufend, 0); + else + fsort(i, depth+1, tfiles, ntfiles, outfd, ftbl); + } + if (lastb != maxb) { + if (prevfd != outfd) + tailfd[panic] = prevfd; + prevfd = ftmp(); + for (i = maxb+1; i <= lastb; i++) + if (!sizes[i]) + get(i, tfiles, ntfiles, crec, bufend,0); + else + fsort(i, depth+1, tfiles, ntfiles, + prevfd, ftbl); + } + + /* sort biggest (or last) bin at this level */ + depth++; + panic++; + binno = maxb; + infiles.top = tfiles.top; /* getnext will free tfiles, */ + nfiles = ntfiles; /* so overwrite them */ + } + if (prevfd != outfd) { + concat(outfd, prevfd); + fclose(prevfd); + } + for (i = panic; i >= 0; --i) + if (tailfd[i]) { + concat(outfd, tailfd[i]); + fclose(tailfd[i]); + } +} + +/* + This is one pass of radix exchange, dumping the bins to disk. + */ +#define swap(a, b, t) t = a, a = b, b = t +void +onepass(a, depth, n, sizes, tr, fd) + u_char **a; + int depth; + long n, sizes[]; + u_char *tr; + FILE *fd; +{ + long tsizes[NBINS+1]; + u_char **bin[257], **top[256], ***bp, ***bpmax, ***tp; + static histo[256]; + int *hp; + register int c; + u_char **an, *t, **aj; + register u_char **ak, *r; + + memset(tsizes, 0, sizeof(tsizes)); + depth += sizeof(TRECHEADER); + an = a + n; + for (ak = a; ak < an; ak++) { + histo[c = tr[**ak]]++; + tsizes[c] += ((RECHEADER *) (*ak -= depth))->length; + } + + bin[0] = a; + bpmax = bin + 256; + tp = top, hp = histo; + for (bp = bin; bp < bpmax; bp++) { + *tp++ = *(bp+1) = *bp + (c = *hp); + *hp++ = 0; + if (c <= 1) + continue; + } + for(aj = a; aj < an; *aj = r, aj = bin[c+1]) + for(r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;) + swap(*ak, r, t); + + for (ak = a, c = 0; c < 256; c++) { + an = bin[c+1]; + n = an - ak; + tsizes[c] += n * sizeof(TRECHEADER); + /* tell getnext how many elements in this bin, this segment. 
*/ + EWRITE(tsizes+c, sizeof(long), 1, fd); + sizes[c] += tsizes[c]; + for (; ak < an; ++ak) + putrec((RECHEADER *) *ak, fd); + } +} diff --git a/usr.bin/sort/fsort.h b/usr.bin/sort/fsort.h new file mode 100644 index 000000000000..54ffa0d3364d --- /dev/null +++ b/usr.bin/sort/fsort.h @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
#define POW 20			/* exponent for buffer size */
#define BUFSIZE (1 << POW)	/* size of the in-memory sort buffer */
#define MAXNUM (BUFSIZE/10)	/* lowish guess at average record size */
#define BUFFEND (EOF-2)		/* "record does not fit" code of the get routines */
#define MAXFCT 1000		/* number of slots in fstack[] */
#define MAXLLEN ((1 << min(POW-4, 16)) - 14)	/* longest line accepted */

extern u_char **keylist, **l2buf, *buffer, *linebuf;

/*
 * Temp files in the stack have a stream, a largest bin (maxb), which is
 * replaced by the last non-empty bin (lastb) when the actual largest
 * bin is smaller than max(half the total file, BUFSIZE).
 * max_o is the offset of maxb, so that it can be sought after the other
 * bins have been sorted.
 */
struct tempfile {
	FILE *fd;	/* stream of the temporary file; 0 once closed */
	u_char maxb;	/* bin that is sorted last at this level */
	u_char lastb;	/* last non-empty bin */
	long max_o;	/* file offset of bin maxb */
};
extern struct tempfile fstack[MAXFCT];
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef lint +static char sccsid[] = "@(#)init.c 8.1 (Berkeley) 6/6/93"; +#endif /* not lint */ + +#include "sort.h" + +#include +#include + +extern struct coldesc clist[(ND+1)*2]; +extern int ncols; +u_char gweights[NBINS]; + +/* + * clist (list of columns which correspond to one or more icol or tcol) + * is in increasing order of columns. + * Fields are kept in increasing order of fields. 
+ */ + +/* + * keep clist in order--inserts a column in a sorted array + */ +static void +insertcol(field) + struct field *field; +{ + int i; + for (i = 0; i < ncols; i++) + if (field->icol.num <= clist[i].num) + break; + if (field->icol.num != clist[i].num) { + memmove(clist+i+1, clist+i, sizeof(COLDESC)*(ncols-i)); + clist[i].num = field->icol.num; + ncols++; + } + if (field->tcol.num && field->tcol.num != field->icol.num) { + for (i = 0; i < ncols; i++) + if (field->tcol.num <= clist[i].num) + break; + if (field->tcol.num != clist[i].num) { + memmove(clist+i+1, clist+i,sizeof(COLDESC)*(ncols-i)); + clist[i].num = field->tcol.num; + ncols++; + } + } +} + +/* + * matches fields with the appropriate columns--n^2 but who cares? + */ +void +fldreset(fldtab) + struct field *fldtab; +{ + int i; + fldtab[0].tcol.p = clist+ncols-1; + for (++fldtab; fldtab->icol.num; ++fldtab) { + for (i = 0; fldtab->icol.num != clist[i].num; i++); + fldtab->icol.p = clist + i; + if (!fldtab->tcol.num) + continue; + for (i = 0; fldtab->tcol.num != clist[i].num; i++); + fldtab->tcol.p = clist + i; + } +} + +/* + * interprets a column in a -k field + */ +char * +setcolumn(pos, cur_fld, gflag) + char *pos; + struct field *cur_fld; + int gflag; +{ + struct column *col; + int tmp; + col = cur_fld->icol.num ? 
(&(*cur_fld).tcol) : (&(*cur_fld).icol); + pos += sscanf(pos, "%d", &(col->num)); + while (isdigit(*pos)) + pos++; + if (col->num <= 0 && !(col->num == 0 && col == &(cur_fld->tcol))) + errx(2, "field numbers must be positive"); + if (*pos == '.') { + if (!col->num) + errx(2, "cannot indent end of line"); + pos += sscanf(++pos, "%d", &(col->indent)); + while (isdigit(*pos)) + pos++; + if (&cur_fld->icol == col) + col->indent--; + if (col->indent < 0) + errx(2, "illegal offset"); + } + if (optval(*pos, cur_fld->tcol.num)) + while (tmp = optval(*pos, cur_fld->tcol.num)) { + cur_fld->flags |= tmp; + pos++; + } + if (cur_fld->icol.num == 0) + cur_fld->icol.num = 1; + return (pos); +} + +int +setfield(pos, cur_fld, gflag) + char *pos; + struct field *cur_fld; + int gflag; +{ + static int nfields = 0; + int tmp; + char *setcolumn(); + if (++nfields == ND) + errx(2, "too many sort keys. (Limit is %d)", ND-1); + cur_fld->weights = ascii; + cur_fld->mask = alltable; + pos = setcolumn(pos, cur_fld, gflag); + if (*pos == '\0') /* key extends to EOL. */ + cur_fld->tcol.num = 0; + else { + if (*pos != ',') + errx(2, "illegal field descriptor"); + setcolumn((++pos), cur_fld, gflag); + } + if (!cur_fld->flags) + cur_fld->flags = gflag; + tmp = cur_fld->flags; + + /* + * Assign appropriate mask table and weight table. + * If the global weights are reversed, the local field + * must be "re-reversed". 
+ */ + if (((tmp & R) ^ (gflag & R)) && tmp & F) + cur_fld->weights = RFtable; + else if (tmp & F) + cur_fld->weights = Ftable; + else if ((tmp & R) ^ (gflag & R)) + cur_fld->weights = Rascii; + if (tmp & I) + cur_fld->mask = itable; + else if (tmp & D) + cur_fld->mask = dtable; + cur_fld->flags |= (gflag & (BI | BT)); + if (!cur_fld->tcol.indent) /* BT has no meaning at end of field */ + cur_fld->flags &= (D|F|I|N|R|BI); + if (cur_fld->tcol.num && !(!(cur_fld->flags & BI) + && cur_fld->flags & BT) && (cur_fld->tcol.num <= cur_fld->icol.num + && cur_fld->tcol.indent < cur_fld->icol.indent)) + errx(2, "fields out of order"); + insertcol(cur_fld); + return (cur_fld->tcol.num); +} + +int +optval(desc, tcolflag) + int desc, tcolflag; +{ + switch(desc) { + case 'b': + if (!tcolflag) + return(BI); + else + return(BT); + case 'd': return(D); + case 'f': return(F); + case 'i': return(I); + case 'n': return(N); + case 'r': return(R); + default: return(0); + } +} + +void +fixit(argc, argv) + int *argc; + char **argv; +{ + int i, j, v, w, x; + static char vbuf[ND*20], *vpos, *tpos; + vpos = vbuf; + + for (i = 1; i < *argc; i++) { + if (argv[i][0] == '+') { + tpos = argv[i]+1; + argv[i] = vpos; + vpos += sprintf(vpos, "-k"); + tpos += sscanf(tpos, "%d", &v); + while (isdigit(*tpos)) + tpos++; + vpos += sprintf(vpos, "%d", v+1); + if (*tpos == '.') { + tpos += sscanf(++tpos, "%d", &x); + vpos += sprintf(vpos, ".%d", x+1); + } + while (*tpos) + *vpos++ = *tpos++; + vpos += sprintf(vpos, ","); + if (argv[i+1] && + argv[i+1][0] == '-' && isdigit(argv[i+1][1])) { + tpos = argv[i+1] + 1; + tpos += sscanf(tpos, "%d", &w); + while (isdigit(*tpos)) + tpos++; + x = 0; + if (*tpos == '.') { + tpos += sscanf(++tpos, "%d", &x); + while (isdigit(*tpos)) + *tpos++; + } + if (x) { + vpos += sprintf(vpos, "%d", w+1); + vpos += sprintf(vpos, ".%d", x); + } else + vpos += sprintf(vpos, "%d", w); + while (*tpos) + *vpos++ = *tpos++; + for (j= i+1; j < *argc; j++) + argv[j] = argv[j+1]; + *argc -= 
1; + } + } + } +} + +/* + * ascii, Rascii, Ftable, and RFtable map + * REC_D -> REC_D; {not REC_D} -> {not REC_D}. + * gweights maps REC_D -> (0 or 255); {not REC_D} -> {not gweights[REC_D]}. + * Note: when sorting in forward order, to encode character zero in a key, + * use \001\001; character 1 becomes \001\002. In this case, character 0 + * is reserved for the field delimiter. Analagously for -r (fld_d = 255). + * Note: this is only good for ASCII sorting. For different LC 's, + * all bets are off. See also num_init in number.c + */ +void +settables(gflags) + int gflags; +{ + u_char *wts; + int i, incr; + for (i=0; i < 256; i++) { + ascii[i] = i; + if (i > REC_D && i < 255 - REC_D+1) + Rascii[i] = 255 - i + 1; + else + Rascii[i] = 255 - i; + if (islower(i)) { + Ftable[i] = Ftable[i- ('a' -'A')]; + RFtable[i] = RFtable[i - ('a' - 'A')]; + } else if (REC_D>= 'A' && REC_D < 'Z' && i < 'a' && i > REC_D) { + Ftable[i] = i + 1; + RFtable[i] = Rascii[i] - 1; + } else { + Ftable[i] = i; + RFtable[i] = Rascii[i]; + } + alltable[i] = 1; + if (i == '\n' || isprint(i)) + itable[i] = 1; + else itable[i] = 0; + if (i == '\n' || i == '\t' || i == ' ' || isalnum(i)) + dtable[i] = 1; + else dtable[i] = 0; + } + Rascii[REC_D] = RFtable[REC_D] = REC_D; + if (REC_D >= 'A' && REC_D < 'Z') + ++Ftable[REC_D + ('a' - 'A')]; + if (gflags & R && (!(gflags & F) || !SINGL_FLD)) + wts = Rascii; + else if (!(gflags & F) || !SINGL_FLD) + wts = ascii; + else if (gflags & R) + wts = RFtable; + else + wts = Ftable; + memmove(gweights, wts, sizeof(gweights)); + incr = (gflags & R) ? -1 : 1; + for (i = 0; i < REC_D; i++) + gweights[i] += incr; + gweights[REC_D] = ((gflags & R) ? 
255 : 0); + if (SINGL_FLD && gflags & F) { + for (i = 0; i < REC_D; i++) { + ascii[i] += incr; + Rascii[i] += incr; + } + ascii[REC_D] = Rascii[REC_D] = gweights[REC_D]; + } +} diff --git a/usr.bin/sort/msort.c b/usr.bin/sort/msort.c new file mode 100644 index 000000000000..3a2866dcce89 --- /dev/null +++ b/usr.bin/sort/msort.c @@ -0,0 +1,304 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef lint +static char sccsid[] = "@(#)msort.c 8.1 (Berkeley) 6/6/93"; +#endif /* not lint */ + +#include "sort.h" +#include "fsort.h" + +#include +#include +#include + +/* Subroutines using comparisons: merge sort and check order */ +#define DELETE (1) +#define LALIGN(n) ((n+3) & ~3) + +typedef struct mfile { + u_char *end; + short flno; + struct recheader rec[1]; +} MFILE; +typedef struct tmfile { + u_char *end; + short flno; + struct trecheader rec[1]; +} TMFILE; +u_char *wts, *wts1 = 0; +struct mfile *cfilebuf; + +static int cmp __P((struct recheader *, struct recheader *)); +static int insert __P((struct mfile **, struct mfile **, int, int)); + +void +fmerge(binno, files, nfiles, get, outfd, fput, ftbl) + union f_handle files; + int binno, nfiles; + int (*get)(); + FILE *outfd; + void (*fput)(); + struct field *ftbl; +{ + FILE *tout; + int i, j, last; + void (*put)(struct recheader *, FILE *); + extern int geteasy(); + struct tempfile *l_fstack; + + wts = ftbl->weights; + if (!UNIQUE && SINGL_FLD && ftbl->flags & F) + wts1 = (ftbl->flags & R) ? Rascii : ascii; + if (!cfilebuf) + cfilebuf = malloc(MAXLLEN + sizeof(TMFILE)); + + i = min(16, nfiles) * LALIGN(MAXLLEN+sizeof(TMFILE)); + if (!buffer || i > BUFSIZE) { + buffer = buffer ? 
realloc(buffer, i) : malloc(i); + if (!buffer) + err(2, NULL); + if (!SINGL_FLD) + linebuf = malloc(MAXLLEN); + } + + if (binno >= 0) + l_fstack = fstack + files.top; + else + l_fstack = fstack; + while (nfiles) { + put = putrec; + for (j = 0; j < nfiles; j += 16) { + if (nfiles <= 16) { + tout = outfd; + put = fput; + } + else + tout = ftmp(); + last = min(16, nfiles - j); + if (binno < 0) { + for (i = 0; i < last; i++) + if (!(l_fstack[i+MAXFCT-1-16].fd = + fopen(files.names[j + i], "r"))) + err(2, "%s", files.names[j+i]); + merge(MAXFCT-1-16, last, get, tout, put, ftbl); + } + else { + for (i = 0; i< last; i++) + rewind(l_fstack[i+j].fd); + merge(files.top+j, last, get, tout, put, ftbl); + } + if (nfiles > 16) l_fstack[j/16].fd = tout; + } + nfiles = (nfiles + 15) / 16; + if (nfiles == 1) + nfiles = 0; + if (binno < 0) { + binno = 0; + get = geteasy; + files.top = 0; + } + } +} + +void +merge(infl0, nfiles, get, outfd, put, ftbl) + int infl0, nfiles; + int (*get)(); + void (*put)(struct recheader *, FILE *); + FILE *outfd; + struct field *ftbl; +{ + int c, i, j; + union f_handle dummy = {0}; + struct mfile *flist[16], *cfile; + for (i = j = 0; i < nfiles; i++) { + cfile = (MFILE *) (buffer + + i * LALIGN(MAXLLEN + sizeof(TMFILE))); + cfile->flno = j + infl0; + cfile->end = cfile->rec->data + MAXLLEN; + for (c = 1; c == 1;) { + if (EOF == (c = get(j+infl0, dummy, nfiles, + cfile->rec, cfile->end, ftbl))) { + --i; + --nfiles; + break; + } + if (i) + c = insert(flist, &cfile, i, !DELETE); + else + flist[0] = cfile; + } + j++; + } + cfile = cfilebuf; + cfile->flno = flist[0]->flno; + cfile->end = cfile->rec->data + MAXLLEN; + while (nfiles) { + for (c = 1; c == 1;) { + if (EOF == (c = get(cfile->flno, dummy, nfiles, + cfile->rec, cfile->end, ftbl))) { + put(flist[0]->rec, outfd); + memmove(flist, flist + 1, + sizeof(MFILE *) * (--nfiles)); + cfile->flno = flist[0]->flno; + break; + } + if (!(c = insert(flist, &cfile, nfiles, DELETE))) + put(cfile->rec, outfd); + } + 
} +} + +/* + * if delete: inserts *rec in flist, deletes flist[0], and leaves it in *rec; + * otherwise just inserts *rec in flist. +*/ +static int +insert(flist, rec, ttop, delete) + struct mfile **flist, **rec; + int delete, ttop; /* delete = 0 or 1 */ +{ + register struct mfile *tmprec; + register int top, mid, bot = 0, cmpv = 1; + tmprec = *rec; + top = ttop; + for (mid = top/2; bot +1 != top; mid = (bot+top)/2) { + cmpv = cmp(tmprec->rec, flist[mid]->rec); + if (cmpv < 0) + top = mid; + else if (cmpv > 0) + bot = mid; + else { + if (!UNIQUE) + bot = mid - 1; + break; + } + } + if (delete) { + if (UNIQUE) { + if (!bot && cmpv) + cmpv = cmp(tmprec->rec, flist[0]->rec); + if (!cmpv) + return(1); + } + tmprec = flist[0]; + if (bot) + memmove(flist, flist+1, bot * sizeof(MFILE **)); + flist[bot] = *rec; + *rec = tmprec; + (*rec)->flno = (*flist)->flno; + return (0); + } + else { + if (!bot && !(UNIQUE && !cmpv)) { + cmpv = cmp(tmprec->rec, flist[0]->rec); + if (cmpv < 0) + bot = -1; + } + if (UNIQUE && !cmpv) + return (1); + bot++; + memmove(flist + bot+1, flist + bot, + (ttop - bot) * sizeof(MFILE **)); + flist[bot] = *rec; + return (0); + } +} + +/* + * check order on one file + */ +void +order(infile, get, ftbl) + union f_handle infile; + int (*get)(); + struct field *ftbl; +{ + u_char *end; + int c; + struct recheader *crec, *prec, *trec; + + if (!SINGL_FLD) + linebuf = malloc(MAXLLEN); + buffer = malloc(2 * (MAXLLEN + sizeof(TRECHEADER))); + end = buffer + 2 * (MAXLLEN + sizeof(TRECHEADER)); + crec = (RECHEADER *) buffer; + prec = (RECHEADER *) (buffer + MAXLLEN + sizeof(TRECHEADER)); + wts = ftbl->weights; + if (SINGL_FLD && ftbl->flags & F) + wts1 = ftbl->flags & R ? 
Rascii : ascii; + else + wts1 = 0; + if (0 == get(-1, infile, 1, prec, end, ftbl)) + while (0 == get(-1, infile, 1, crec, end, ftbl)) { + if (0 < (c = cmp(prec, crec))) { + crec->data[crec->length-1] = 0; + errx(1, "found disorder: %s", crec->data+crec->offset); + } + if (UNIQUE && !c) { + crec->data[crec->length-1] = 0; + errx(1, "found non-uniqueness: %s", + crec->data+crec->offset); + } + trec = prec; + prec = crec; + crec = trec; + } + exit(0); +} + +static int +cmp(rec1, rec2) + struct recheader *rec1, *rec2; +{ + register r; + register u_char *pos1, *pos2, *end; + register u_char *cwts; + for (cwts = wts; cwts; cwts = (cwts == wts1 ? 0 : wts1)) { + pos1 = rec1->data; + pos2 = rec2->data; + if (!SINGL_FLD && UNIQUE) + end = pos1 + min(rec1->offset, rec2->offset); + else + end = pos1 + min(rec1->length, rec2->length); + for (; pos1 < end; ) { + if (r = cwts[*pos1++] - cwts[*pos2++]) + return (r); + } + } + return (0); +} diff --git a/usr.bin/sort/pathnames.h b/usr.bin/sort/pathnames.h new file mode 100644 index 000000000000..1d64083cbdc0 --- /dev/null +++ b/usr.bin/sort/pathnames.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)pathnames.h 8.1 (Berkeley) 6/6/93 + */ + +#define _PATH_STDIN "/dev/stdin" /* file name that sort.c substitutes for a "-" operand (see devstdin) */ +#define _PATH_SORTTMP "/var/tmp/sort.XXXXXXXX" /* mktemp() template used by ftmp() in tmp.c when TMPDIR is not set */ diff --git a/usr.bin/sort/sort.1 b/usr.bin/sort/sort.1 new file mode 100644 index 000000000000..d3bb6670149e --- /dev/null +++ b/usr.bin/sort/sort.1 @@ -0,0 +1,364 @@ +.\" Copyright (c) 1991, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" This code is derived from software contributed to Berkeley by +.\" the Institute of Electrical and Electronics Engineers, Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. 
Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)sort.1 8.1 (Berkeley) 6/6/93 +.\" +.Dd June 6, 1993 +.Dt SORT 1 +.Os +.Sh NAME +.Nm sort +.Nd sort or merge text files +.Sh SYNOPSIS +.Nm sort +.Op Fl cmubdfinr +.Op Fl t Ar char +.Op Fl T Ar char +.Oo +.Cm Fl k Ar field1[,field2] +.Oc +.Ar ... +.Op Fl o Ar output +.Op Ar file +.Ar ... +.Sh DESCRIPTION +The +.Nm sort +utility +sorts text files by lines. 
+Comparisons are based on one or more sort keys extracted +from each line of input, and are performed +lexicographically. By default, if keys are not given, +.Nm sort +regards each input line as a single field. +.Pp +The following options are available: +.Bl -tag -width indent +.It Fl c +Check that the single input file is sorted. +If the file is not sorted, +.Nm sort +produces the appropriate error messages and exits with code 1; +otherwise, +.Nm sort +returns 0. +.Nm Sort +.Fl c +produces no output. +.It Fl m +Merge only; the input files are assumed to be pre-sorted. +.It Fl o Ar output +The argument given is the name of an +.Ar output +file to +be used instead of the standard output. +This file +can be the same as one of the input files. +.It Fl u +Unique: suppress all but one in each set of lines +having equal keys. +If used with the +.Fl c +option, +check that there are no lines with duplicate keys. +.El +.Pp +The following options override the default ordering rules. +When ordering options appear independent of key field +specifications, the requested field ordering rules are +applied globally to all sort keys. +When attached to a specific key (see +.Fl k ) , +the ordering options override +all global ordering options for that key. +.Bl -tag -width indent +.It Fl d +Only blank space and alphanumeric characters +.\" according +.\" to the current setting of LC_CTYPE +are used +in making comparisons. +.It Fl f +Considers all lowercase characters that have uppercase +equivalents to be the same for purposes of +comparison. +.It Fl i +Ignore all non-printable characters. +.It Fl n +An initial numeric string, consisting of optional +blank space, optional minus sign, and zero or more +digits (including decimal point) +.\" with +.\" optional radix character and thousands +.\" separator +.\" (as defined in the current locale), +is sorted by arithmetic value. +(The +.Fl n +option no longer implies +the +.Fl b +option.) +.It Fl r +Reverse the sense of comparisons. 
+.El +.Pp +The treatment of field separators can be altered using the +options: +.Bl -tag -width indent +.It Fl b +Ignores leading blank space when determining the start +and end of a restricted sort key. +A +.Fl b +option specified before the first +.Fl k +option applies globally to all +.Fl k +options. +Otherwise, the +.Fl b +option can be +attached independently to each +.Ar field +argument of the +.Fl k +option (see below). +Note that the +.Fl b +option +has no effect unless key fields are specified. +.It Fl t Ar char +.Ar Char +is used as the field separator character. The initial +.Ar char +is not considered to be part of a field when determining +key offsets (see below). +Each occurrence of +.Ar char +is significant (for example, +.Dq Ar charchar +delimits an empty field). +If +.Fl t +is not specified, +blank space characters are used as default field +separators. +.It Fl T Ar char +.Ar Char +is used as the record separator character. +This should be used with discretion; +.Fl T Ar char +usually produces undesirable results. +The default line separator is newline. +.It Fl k Ar field1[,field2] +Designates the starting position, +.Ar field1 , +and optional ending position, +.Ar field2 , +of a key field. +The +.Fl k +option replaces the obsolescent options +.Cm \(pl Ns Ar pos1 +and +.Fl Ns Ar pos2 . +.El +.Pp +The following operands are available: +.Bl -tag -width indent +.It Ar file +The pathname of a file to be sorted, merged, or checked. +If no file +operands are specified, or if +a file operand is +.Fl , +the standard input is used. +.El +.Pp +A field is +defined as a minimal sequence of characters followed by a +field separator or a newline character. +By default, the first +blank space of a sequence of blank spaces acts as the field separator. +All blank spaces in a sequence of blank spaces are considered +as part of the next field; for example, all blank spaces at +the beginning of a line are considered to be part of the +first field. 
+.Pp +Fields are specified +by the +.Fl k Ar field1[,field2] +argument. A missing +.Ar field2 +argument defaults to the end of a line. +.Pp +The arguments +.Ar field1 +and +.Ar field2 +have the form +.Em m.n +followed by one or more of the options +.Fl b , d , f , i , +.Fl n , r . +A +.Ar field1 +position specified by +.Em m.n +.Em (m,n > 0) +is interpreted as the +.Em n Ns th +character in the +.Em m Ns th +field. +A missing +.Em \&.n +in +.Ar field1 +means +.Ql \&.1 , +indicating the first character of the +.Em m Ns th +field. +If the +.Fl b +option is in effect, +.Em n +is counted from the first +non-blank character in the +.Em m Ns th +field; +.Em m Ns \&.1b +refers to the first +non-blank character in the +.Em m Ns th +field. +.Pp +A +.Ar field2 +position specified by +.Em m.n +is interpreted as +the +.Em n Ns th +character (including separators) of the +.Em m Ns th +field. +A missing +.Em \&.n +indicates the last character of the +.Em m Ns th +field; +.Em m += \&0 +designates the end of a line. +Thus the option +.Fl k Ar v.x,w.y +is synonymous with the obsolescent option +.Cm \(pl Ns Ar v-\&1.x-\&1 +.Fl Ns Ar w-\&1.y ; +when +.Em y +is omitted, +.Fl k Ar v.x,w +is synonymous with +.Cm \(pl Ns Ar v-\&1.x-\&1 +.Fl Ns Ar w+1.0 . +The obsolescent +.Cm \(pl Ns Ar pos1 +.Fl Ns Ar pos2 +option is still supported, except for +.Fl Ns Ar w\&.0b , +which has no +.Fl k +equivalent. +.Sh FILES +.Bl -tag -width Pa -compact +.It Pa /var/tmp/sort.* +Default temporary files. +.It Pa Ar output Ns #PID +Temporary name for +.Ar output +if +.Ar output +already exists. +.El +.Sh SEE ALSO +.Xr comm 1 , +.Xr uniq 1 , +.Xr join 1 +.Sh RETURN VALUES +Sort exits with one of the following values: +.Bl -tag -width flag -compact +.It Pa 0: +normal behavior. +.It Pa 1: +on disorder (or non-uniqueness) with the +.Fl c +option +.It Pa 2: +an error occurred. +.El +.Sh BUGS +Lines longer than 65522 characters are discarded and processing continues. 
+To sort files larger than 60Mb, use +.Nm sort +.Fl H ; +files larger than 704Mb must be sorted in smaller pieces, then merged. +To protect data +.Nm sort +.Fl o +calls link and unlink, and thus fails in protected directories. +.Sh HISTORY +A +.Nm sort +command appeared in +.At v6 . +.Sh NOTES +The current sort command uses lexicographic radix sorting, which requires +that sort keys be kept in memory (as opposed to previous versions which used quick +and merge sorts and did not.) +Thus performance depends highly on efficient choice of sort keys, and the +.Fl b +option and the +.Ar field2 +argument of the +.Fl k +option should be used whenever possible. +Similarly, +.Nm sort +.Fl k1f +is equivalent to +.Nm sort +.Fl f +and may take twice as long. diff --git a/usr.bin/sort/sort.c b/usr.bin/sort/sort.c new file mode 100644 index 000000000000..e7df90162719 --- /dev/null +++ b/usr.bin/sort/sort.c @@ -0,0 +1,279 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef lint +static char copyright[] = +"@(#) Copyright (c) 1993\n\ + The Regents of the University of California. All rights reserved.\n"; +#endif /* not lint */ + +#ifndef lint +static char sccsid[] = "@(#)sort.c 8.1 (Berkeley) 6/6/93"; +#endif /* not lint */ + +/* Sort sorts a file using an optional user-defined key. + * Sort uses radix sort for internal sorting, and allows + * a choice of merge sort and radix sort for external sorting. + */ + +#include "sort.h" +#include "fsort.h" +#include "pathnames.h" + +#include +#include +#include +#include +#include + +int REC_D = '\n'; +u_char d_mask[NBINS]; /* flags for rec_d, field_d, */ +/* + * weight tables. Gweights is one of ascii, Rascii.. + * modified to weight rec_d = 0 (or 255) + */ +extern u_char gweights[NBINS]; +u_char ascii[NBINS], Rascii[NBINS], RFtable[NBINS], Ftable[NBINS]; +/* + * masks of ignored characters. 
Alltable is 256 ones + */ +u_char dtable[NBINS], itable[NBINS], alltable[NBINS]; +int SINGL_FLD = 0, SEP_FLAG = 0, UNIQUE = 0; +struct coldesc clist[(ND+1)*2]; +int ncols = 0; +extern struct coldesc clist[(ND+1)*2]; +extern int ncols; + +char devstdin[] = _PATH_STDIN; +char toutpath[_POSIX_PATH_MAX]; + +static void cleanup __P((void)); +static void onsig __P((int)); +static void usage __P((char *)); + +int +main(argc, argv) + int argc; + char *argv[]; +{ + extern int optind; + extern char *optarg; + int (*get)(); + int ch, i, stdinflag = 0, tmp = 0; + char cflag = 0, mflag = 0, nflag = 0; + char *outfile, *outpath = 0; + struct field fldtab[ND+2], *ftpos; + union f_handle filelist; + FILE *outfd; + memset(fldtab, 0, (ND+2)*sizeof(struct field)); + memset(d_mask, 0, NBINS); + d_mask[REC_D = '\n'] = REC_D_F; + SINGL_FLD = SEP_FLAG = 0; + d_mask['\t'] = d_mask[' '] = BLANK | FLD_D; + ftpos = fldtab; + fixit(&argc, argv); + while ((ch = getopt(argc, argv, "bcdfik:mHno:rt:T:ux")) != EOF) { + switch (ch) { + case 'b': fldtab->flags |= BI | BT; + break; + case 'd': + case 'i': + case 'f': + case 'r': tmp |= optval(ch, 0); + if (tmp & R && tmp & F) + fldtab->weights = RFtable; + else if (tmp & F) + fldtab->weights = Ftable; + else if(tmp & R) + fldtab->weights = Rascii; + fldtab->flags |= tmp; + break; + case 'o': + outpath = optarg; + break; + case 'n': + nflag = 1; + setfield("1n", ++ftpos, fldtab->flags&(~R)); + break; + case 'k': + setfield(optarg, ++ftpos, fldtab->flags); + break; + case 't': + if (SEP_FLAG) + usage("multiple field delimiters"); + SEP_FLAG = 1; + d_mask[' '] &= ~FLD_D; + d_mask['\t'] &= ~FLD_D; + d_mask[*optarg] |= FLD_D; + if (d_mask[*optarg] & REC_D_F) + err(2, "record/field delimiter clash"); + break; + case 'T': + if (REC_D != '\n') + usage("multiple record delimiters"); + if ('\n' == (REC_D = *optarg)) + break; + d_mask['\n'] = d_mask[' ']; + d_mask[REC_D] = REC_D_F; + break; + case 'u': + UNIQUE = 1; + break; + case 'c': + cflag = 1; + break; + 
case 'm': + mflag = 1; + break; + case 'H': + PANIC = 0; + break; + case '?': + default: usage(""); + } + } + if (cflag && argc > optind+1) + errx(2, "too many input files for -c option"); + if (argc - 2 > optind && !strcmp(argv[argc-2], "-o")) { + outpath = argv[argc-1]; + argc -= 2; + } + if (mflag && argc - optind > (MAXFCT - (16+1))*16) + errx(2, "too many input files for -m option"); + for (i = optind; i < argc; i++) { + /* allow one occurrence of /dev/stdin */ + if (!strcmp(argv[i], "-") || !strcmp(argv[i], devstdin)) { + if (stdinflag) + warnx("ignoring extra \"%s\" in file list", + argv[i]); + else { + stdinflag = 1; + argv[i] = devstdin; + } + } else if (ch = access(argv[i], R_OK)) + err(2, "%s", argv[i]); + } + if (!(fldtab->flags & (I|D) || fldtab[1].icol.num)) { + SINGL_FLD = 1; + fldtab[0].icol.num = 1; + } else { + if (!fldtab[1].icol.num) { + fldtab[0].flags &= ~(BI|BT); + setfield("1", ++ftpos, fldtab->flags); + } + if (nflag) + fldtab[1].flags |= fldtab->flags; + fldreset(fldtab); + fldtab[0].flags &= ~F; + } + settables(fldtab[0].flags); + num_init(); + fldtab->weights = gweights; + if (optind == argc) + argv[--optind] = devstdin; + filelist.names = argv+optind; + if (SINGL_FLD) + get = makeline; + else + get = makekey; + if (cflag) { + order(filelist, get, fldtab); + /* NOT REACHED */ + } + if (!outpath) { + (void)snprintf(toutpath, + sizeof(toutpath), "%sstdout", _PATH_DEV); + outfile = outpath = toutpath; + } else if (!(ch = access(outpath, 0)) && + strncmp(_PATH_DEV, outpath, 5)) { + struct sigaction act = {0, SIG_BLOCK, 6}; + int sigtable[] = {SIGHUP, SIGINT, SIGPIPE, SIGXCPU, SIGXFSZ, + SIGVTALRM, SIGPROF, 0}; + errno = 0; + if (access(outpath, W_OK)) + err(2, "%s", outpath); + act.sa_handler = cleanup; + (void)snprintf(toutpath, sizeof(toutpath), "%sXXXX", outpath); + outfile = mktemp(toutpath); + if (!outfile) + err(2, "%s", toutpath); + (void)atexit(cleanup); + for (i = 0; sigtable[i]; ++i) /* always unlink toutpath */ + 
sigaction(sigtable[i], &act, 0); + } else outfile = outpath; + if (!(outfd = fopen(outfile, "w"))) + err(2, "%s", outfile); + if (mflag) + fmerge(-1, filelist, argc-optind, get, outfd, putline, fldtab); + else + fsort(-1, 0, filelist, argc-optind, outfd, fldtab); + if (outfile != outpath) { + if (access(outfile, 0)) + err(2, "%s", outfile); + (void)unlink(outpath); + if (link(outfile, outpath)) + err(2, "cannot link %s: output left in %s", + outpath, outfile); + (void)unlink(outfile); + } + exit(0); +} + +static void +onsig(s) + int s; +{ + cleanup(); + exit(2); /* return 2 on error/interrupt */ +} + +static void +cleanup() +{ + if (toutpath[0]) + (void)unlink(toutpath); +} + +static void +usage(msg) + char *msg; +{ + if (msg) + (void)fprintf(stderr, "sort: %s\n", msg); + (void)fprintf(stderr, "usage: [-o output] [-cmubdfinr] [-t char] "); + (void)fprintf(stderr, "[-T char] [-k keydef] ... [files]\n"); + exit(2); +} diff --git a/usr.bin/sort/sort.h b/usr.bin/sort/sort.h new file mode 100644 index 000000000000..d164b5ac4212 --- /dev/null +++ b/usr.bin/sort/sort.h @@ -0,0 +1,142 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sort.h 8.1 (Berkeley) 6/6/93 + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#define NBINS 256 +#define MAXMERGE 16 + +/* values for masks, weights, and other flags. */ +#define I 1 /* mask out non-printable characters */ +#define D 2 /* sort alphanumeric characters only */ +#define N 4 /* Field is a number */ +#define F 8 /* weight lower and upper case the same */ +#define R 16 /* Field is reversed with respect to the global weight */ +#define BI 32 /* ignore blanks in icol */ +#define BT 64 /* ignore blanks in tcol */ + +/* masks for delimiters: blanks, fields, and termination. 
*/ +#define BLANK 1 /* ' ', '\t'; '\n' if -T is invoked */ +#define FLD_D 2 /* ' ', '\t' default; from -t otherwise */ +#define REC_D_F 4 /* '\n' default; from -T otherwise */ + +#define ND 10 /* limit on number of -k options. */ + +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? (a) : (b)) + +#define FCLOSE(file) { \ + if (EOF == fclose(file)) \ + err(2, "%s", file); \ +} + +#define EWRITE(ptr, size, n, f) { \ + if (!fwrite(ptr, size, n, f)) \ + err(2, NULL); \ +} + +/* length of record is currently limited to 2^16 - 1 */ +typedef u_short length_t; + +#define SALIGN(n) ((n+1) & ~1) + +/* a record is a key/line pair starting at rec.data. It has a total length + * and an offset to the start of the line half of the pair. + */ +typedef struct recheader { + length_t length; + length_t offset; + u_char data[1]; +} RECHEADER; + +typedef struct trecheader { + length_t length; + length_t offset; +} TRECHEADER; + +/* This is the column as seen by struct field. It is used by enterfield. + * They are matched with corresponding coldescs during initialization. + */ +struct column { + struct coldesc *p; + int num; + int indent; +}; + +/* a coldesc has a number and pointers to the beginning and end of the + * corresponding column in the current line. This is determined in enterkey. + */ +typedef struct coldesc { + u_char *start; + u_char *end; + int num; +} COLDESC; + +/* A field has an initial and final column; an omitted final column + * implies the end of the line. Flags regulate omission of blanks and + * numerical sorts; mask determines which characters are ignored (from -i, -d); + * weights determines the sort weights of a character (from -f, -r). 
+ */ +struct field { + struct column icol; + struct column tcol; + u_int flags; + u_char *mask; + u_char *weights; +}; + +union f_handle { + int top; + char **names; +}; +extern int PANIC; /* maximum depth of fsort before fmerge is called */ +extern u_char ascii[NBINS], Rascii[NBINS], Ftable[NBINS], RFtable[NBINS]; +extern u_char alltable[NBINS], dtable[NBINS], itable[NBINS]; +extern u_char d_mask[NBINS]; +extern int SINGL_FLD, SEP_FLAG, UNIQUE; +extern int REC_D; + +#include "extern.h" diff --git a/usr.bin/sort/tmp.c b/usr.bin/sort/tmp.c new file mode 100644 index 000000000000..0e7241f91cf2 --- /dev/null +++ b/usr.bin/sort/tmp.c @@ -0,0 +1,87 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Peter McIlroy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef lint +static char sccsid[] = "@(#)tmp.c 8.1 (Berkeley) 6/6/93"; +#endif /* not lint */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pathnames.h" + +#define _NAME_TMP "sort.XXXXXXXX" + +FILE * +ftmp() +{ + static char *envtmp; + sigset_t set, oset; + static int first = 0; + FILE *fd; + char pathb[_POSIX_PATH_MAX], *path; + + path = pathb; + if (!first && !envtmp) { + envtmp = getenv("TMPDIR"); + first = 1; + } + if (envtmp) + (void)snprintf(path, + sizeof(pathb), "%s/%s", envtmp, _NAME_TMP); + else { + memmove(path, _PATH_SORTTMP, sizeof(_PATH_SORTTMP)); + } + sigfillset(&set); + (void)sigprocmask(SIG_BLOCK, &set, &oset); + path = mktemp(path); + if (!path) + err(2, "%s"); + if (!(fd = fopen(path, "w+"))) + err(2, "%s", path); + (void)unlink(path); + + (void)sigprocmask(SIG_SETMASK, &oset, NULL); + return (fd); +};