NetBSD/gnu/usr.bin/sort/sort.c

1863 lines
43 KiB
C

/* sort - sort lines of text (with all kinds of options).
Copyright (C) 1988, 1991 Free Software Foundation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
Written December 1988 by Mike Haertel.
The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
or (US mail) as Mike Haertel c/o Free Software Foundation. */
#ifdef HAVE_CONFIG_H
#if defined (CONFIG_BROKETS)
/* We use <config.h> instead of "config.h" so that a compilation
using -I. -I$srcdir will use ./config.h rather than $srcdir/config.h
(which it would do because it found this file in $srcdir). */
#include <config.h>
#else
#include "config.h"
#endif
#endif
/* Get isblank from GNU libc. */
#define _GNU_SOURCE
#include <ctype.h>
#ifndef ISBLANK
#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
static void usage ();
#define min(a, b) ((a) < (b) ? (a) : (b))
#define UCHAR_LIM (UCHAR_MAX + 1)
#define UCHAR(c) ((unsigned char) (c))
#ifdef isascii
#define ISALNUM(c) (isascii(c) && isalnum(c))
#define ISDIGIT(c) (isascii(c) && isdigit(c))
#define ISPRINT(c) (isascii(c) && isprint(c))
#define ISLOWER(c) (isascii(c) && islower(c))
#else
#define ISALNUM(c) isalnum(c)
#define ISDIGIT(c) isdigit(c)
#define ISPRINT(c) isprint(c)
#define ISLOWER(c) islower(c)
#endif
/* The kind of blanks for '-b' to skip in various options. */
enum blanktype { bl_start, bl_end, bl_both };
/* The name this program was run with. */
char *program_name;
/* Table of digits. */
static int digits[UCHAR_LIM];
/* Table of white space. */
static int blanks[UCHAR_LIM];
/* Table of non-printing characters. */
static int nonprinting[UCHAR_LIM];
/* Table of non-dictionary characters (not letters, digits, or blanks). */
static int nondictionary[UCHAR_LIM];
/* Translation table folding lower case to upper. */
static char fold_toupper[UCHAR_LIM];
/* Table mapping 3-letter month names to integers.
Alphabetic order allows binary search. */
static struct month
{
char *name;
int val;
} const monthtab[] =
{
{"APR", 4},
{"AUG", 8},
{"DEC", 12},
{"FEB", 2},
{"JAN", 1},
{"JUL", 7},
{"JUN", 6},
{"MAR", 3},
{"MAY", 5},
{"NOV", 11},
{"OCT", 10},
{"SEP", 9}
};
/* During the merge phase, the number of files to merge at once. */
#define NMERGE 16
/* Initial buffer size for in core sorting. Will not grow unless a
line longer than this is seen. */
static int sortalloc = 524288;
/* Initial buffer size for in core merge buffers. Bear in mind that
up to NMERGE * mergealloc bytes may be allocated for merge buffers. */
static int mergealloc = 16384;
/* Guess of average line length. */
static int linelength = 30;
/* Maximum number of elements for the array(s) of struct line's, in bytes. */
#define LINEALLOC 262144
/* Prefix for temporary file names. */
static char *temp_file_prefix;
/* Flag to reverse the order of all comparisons. */
static int reverse;
/* Flag for stable sort. This turns off the last ditch bytewise
comparison of lines, and instead leaves lines in the same order
they were read if all keys compare equal. */
static int stable;
/* Tab character separating fields. If NUL, then fields are separated
by the empty string between a non-whitespace character and a whitespace
character. */
static char tab;
/* Flag to remove consecutive duplicate lines from the output.
Only the last of a sequence of equal lines will be output. */
static int unique;
/* Nonzero if any of the input files are the standard input. */
static int have_read_stdin;
/* Lines are held in core as counted strings. */
struct line
{
char *text; /* Text of the line. */
int length; /* Length not including final newline. */
char *keybeg; /* Start of first key. */
char *keylim; /* Limit of first key. */
};
/* Arrays of lines. */
struct lines
{
struct line *lines; /* Dynamically allocated array of lines. */
int used; /* Number of slots used. */
int alloc; /* Number of slots allocated. */
int limit; /* Max number of slots to allocate. */
};
/* Input buffers. */
struct buffer
{
char *buf; /* Dynamically allocated buffer. */
int used; /* Number of bytes used. */
int alloc; /* Number of bytes allocated. */
int left; /* Number of bytes left after line parsing. */
};
/* Lists of key field comparisons to be tried. */
static struct keyfield
{
int sword; /* Zero-origin 'word' to start at. */
int schar; /* Additional characters to skip. */
int skipsblanks; /* Skip leading white space at start. */
int eword; /* Zero-origin first word after field. */
int echar; /* Additional characters in field. */
int skipeblanks; /* Skip trailing white space at finish. */
int *ignore; /* Boolean array of characters to ignore. */
char *translate; /* Translation applied to characters. */
int numeric; /* Flag for numeric comparison. */
int month; /* Flag for comparison by month name. */
int reverse; /* Reverse the sense of comparison. */
struct keyfield *next; /* Next keyfield to try. */
} keyhead;
/* The list of temporary files. */
static struct tempnode
{
char *name;
struct tempnode *next;
} temphead;
/* Clean up any remaining temporary files. */
static void
cleanup ()
{
struct tempnode *node;
for (node = temphead.next; node; node = node->next)
unlink (node->name);
}
/* Allocate N bytes of memory dynamically, with error checking. */
char *
xmalloc (n)
unsigned n;
{
char *p;
p = malloc (n);
if (p == 0)
{
error (0, 0, "virtual memory exhausted");
cleanup ();
exit (2);
}
return p;
}
/* Change the size of an allocated block of memory P to N bytes,
with error checking.
If P is NULL, run xmalloc.
If N is 0, run free and return NULL. */
char *
xrealloc (p, n)
char *p;
unsigned n;
{
if (p == 0)
return xmalloc (n);
if (n == 0)
{
free (p);
return 0;
}
p = realloc (p, n);
if (p == 0)
{
error (0, 0, "virtual memory exhausted");
cleanup ();
exit (2);
}
return p;
}
static FILE *
xfopen (file, how)
char *file, *how;
{
FILE *fp = strcmp (file, "-") ? fopen (file, how) : stdin;
if (fp == 0)
{
error (0, errno, "%s", file);
cleanup ();
exit (2);
}
if (fp == stdin)
have_read_stdin = 1;
return fp;
}
static void
xfclose (fp)
FILE *fp;
{
fflush (fp);
if (fp != stdin && fp != stdout)
{
if (fclose (fp) != 0)
{
error (0, errno, "error closing file");
cleanup ();
exit (2);
}
}
else
/* Allow reading stdin from tty more than once. */
clearerr (fp);
}
static void
xfwrite (buf, size, nelem, fp)
char *buf;
int size, nelem;
FILE *fp;
{
if (fwrite (buf, size, nelem, fp) != nelem)
{
error (0, errno, "write error");
cleanup ();
exit (2);
}
}
/* Return a name for a temporary file. */
static char *
tempname ()
{
static int seq;
int len = strlen (temp_file_prefix);
char *name = xmalloc (len + 16);
struct tempnode *node =
(struct tempnode *) xmalloc (sizeof (struct tempnode));
if (len && temp_file_prefix[len - 1] != '/')
sprintf (name, "%s/sort%5.5d%5.5d", temp_file_prefix, getpid (), ++seq);
else
sprintf (name, "%ssort%5.5d%5.5d", temp_file_prefix, getpid (), ++seq);
node->name = name;
node->next = temphead.next;
temphead.next = node;
return name;
}
/* Search through the list of temporary files for NAME;
remove it if it is found on the list. */
static void
zaptemp (name)
char *name;
{
struct tempnode *node, *temp;
for (node = &temphead; node->next; node = node->next)
if (!strcmp (name, node->next->name))
break;
if (node->next)
{
temp = node->next;
unlink (temp->name);
free (temp->name);
node->next = temp->next;
free ((char *) temp);
}
}
/* Initialize the character class tables. */
static void
inittables ()
{
int i;
for (i = 0; i < UCHAR_LIM; ++i)
{
if (ISBLANK (i))
blanks[i] = 1;
if (ISDIGIT (i))
digits[i] = 1;
if (!ISPRINT (i))
nonprinting[i] = 1;
if (!ISALNUM (i) && !ISBLANK (i))
nondictionary[i] = 1;
if (ISLOWER (i))
fold_toupper[i] = toupper (i);
else
fold_toupper[i] = i;
}
}
/* Initialize BUF, allocating ALLOC bytes initially. */
static void
initbuf (buf, alloc)
struct buffer *buf;
int alloc;
{
buf->alloc = alloc;
buf->buf = xmalloc (buf->alloc);
buf->used = buf->left = 0;
}
/* Fill BUF reading from FP, moving buf->left bytes from the end
of buf->buf to the beginning first. If EOF is reached and the
file wasn't terminated by a newline, supply one. Return a count
of bytes buffered. */
static int
fillbuf (buf, fp)
struct buffer *buf;
FILE *fp;
{
int cc;
bcopy (buf->buf + buf->used - buf->left, buf->buf, buf->left);
buf->used = buf->left;
while (!feof (fp) && (buf->used == 0 || !memchr (buf->buf, '\n', buf->used)))
{
if (buf->used == buf->alloc)
{
buf->alloc *= 2;
buf->buf = xrealloc (buf->buf, buf->alloc);
}
cc = fread (buf->buf + buf->used, 1, buf->alloc - buf->used, fp);
if (ferror (fp))
{
error (0, errno, "read error");
cleanup ();
exit (2);
}
buf->used += cc;
}
if (feof (fp) && buf->used && buf->buf[buf->used - 1] != '\n')
{
if (buf->used == buf->alloc)
{
buf->alloc *= 2;
buf->buf = xrealloc (buf->buf, buf->alloc);
}
buf->buf[buf->used++] = '\n';
}
return buf->used;
}
/* Initialize LINES, allocating space for ALLOC lines initially.
LIMIT is the maximum possible number of lines to allocate space
for, ever. */
static void
initlines (lines, alloc, limit)
struct lines *lines;
int alloc;
int limit;
{
lines->alloc = alloc;
lines->lines = (struct line *) xmalloc (lines->alloc * sizeof (struct line));
lines->used = 0;
lines->limit = limit;
}
/* Return a pointer to the first character of the field specified
by KEY in LINE. */
static char *
begfield (line, key)
struct line *line;
struct keyfield *key;
{
register char *ptr = line->text, *lim = ptr + line->length;
register int sword = key->sword, schar = key->schar;
if (tab)
while (ptr < lim && sword--)
{
while (ptr < lim && *ptr != tab)
++ptr;
if (ptr < lim)
++ptr;
}
else
while (ptr < lim && sword--)
{
while (ptr < lim && blanks[UCHAR (*ptr)])
++ptr;
while (ptr < lim && !blanks[UCHAR (*ptr)])
++ptr;
}
if (key->skipsblanks)
while (ptr < lim && blanks[UCHAR (*ptr)])
++ptr;
while (ptr < lim && schar--)
++ptr;
return ptr;
}
/* Return the limit of (a pointer to the first character after) the field
in LINE specified by KEY. */
static char *
limfield (line, key)
struct line *line;
struct keyfield *key;
{
register char *ptr = line->text, *lim = ptr + line->length;
register int eword = key->eword, echar = key->echar;
if (tab)
while (ptr < lim && eword--)
{
while (ptr < lim && *ptr != tab)
++ptr;
if (ptr < lim && (eword || key->skipeblanks))
++ptr;
}
else
while (ptr < lim && eword--)
{
while (ptr < lim && blanks[UCHAR (*ptr)])
++ptr;
while (ptr < lim && !blanks[UCHAR (*ptr)])
++ptr;
}
if (key->skipeblanks)
while (ptr < lim && blanks[UCHAR (*ptr)])
++ptr;
while (ptr < lim && echar--)
++ptr;
return ptr;
}
/* Find the lines in BUF, storing pointers and lengths in LINES.
Also replace newlines with NULs. */
static void
findlines (buf, lines)
struct buffer *buf;
struct lines *lines;
{
register char *beg = buf->buf, *lim = buf->buf + buf->used, *ptr;
struct keyfield *key = keyhead.next;
lines->used = 0;
while (beg < lim && (ptr = memchr (beg, '\n', lim - beg))
&& lines->used < lines->limit)
{
/* There are various places in the code that rely on a NUL
being at the end of in-core lines; NULs inside the lines
will not cause trouble, though. */
*ptr = '\0';
if (lines->used == lines->alloc)
{
lines->alloc *= 2;
lines->lines = (struct line *)
xrealloc ((char *) lines->lines,
lines->alloc * sizeof (struct line));
}
lines->lines[lines->used].text = beg;
lines->lines[lines->used].length = ptr - beg;
/* Precompute the position of the first key for efficiency. */
if (key)
{
if (key->eword >= 0)
lines->lines[lines->used].keylim =
limfield (&lines->lines[lines->used], key);
else
lines->lines[lines->used].keylim = ptr;
if (key->sword >= 0)
lines->lines[lines->used].keybeg =
begfield (&lines->lines[lines->used], key);
else
{
if (key->skipsblanks)
while (blanks[UCHAR (*beg)])
++beg;
lines->lines[lines->used].keybeg = beg;
}
}
else
{
lines->lines[lines->used].keybeg = 0;
lines->lines[lines->used].keylim = 0;
}
++lines->used;
beg = ptr + 1;
}
buf->left = lim - beg;
}
/* Compare strings A and B containing decimal fractions < 1. Each string
should begin with a decimal point followed immediately by the digits
of the fraction. Strings not of this form are considered to be zero. */
static int
fraccompare (a, b)
register char *a, *b;
{
register tmpa = UCHAR (*a), tmpb = UCHAR (*b);
if (tmpa == '.' && tmpb == '.')
{
do
tmpa = UCHAR (*++a), tmpb = UCHAR (*++b);
while (tmpa == tmpb && digits[tmpa]);
if (digits[tmpa] && digits[tmpb])
return tmpa - tmpb;
if (digits[tmpa])
{
while (tmpa == '0')
tmpa = UCHAR (*++a);
if (digits[tmpa])
return 1;
return 0;
}
if (digits[tmpb])
{
while (tmpb == '0')
tmpb = UCHAR (*++b);
if (digits[tmpb])
return -1;
return 0;
}
return 0;
}
else if (tmpa == '.')
{
do
tmpa = UCHAR (*++a);
while (tmpa == '0');
if (digits[tmpa])
return 1;
return 0;
}
else if (tmpb == '.')
{
do
tmpb = UCHAR (*++b);
while (tmpb == '0');
if (digits[tmpb])
return -1;
return 0;
}
return 0;
}
/* Compare strings A and B as numbers without explicitly converting them to
machine numbers. Comparatively slow for short strings, but asymptotically
hideously fast. */
static int
numcompare (a, b)
register char *a, *b;
{
register int tmpa, tmpb, loga, logb, tmp;
tmpa = UCHAR (*a), tmpb = UCHAR (*b);
while (blanks[tmpa])
tmpa = UCHAR (*++a);
while (blanks[tmpb])
tmpb = UCHAR (*++b);
if (tmpa == '-')
{
tmpa = UCHAR (*++a);
if (tmpb != '-')
{
if (digits[tmpa] && digits[tmpb])
return -1;
return 0;
}
tmpb = UCHAR (*++b);
while (tmpa == '0')
tmpa = UCHAR (*++a);
while (tmpb == '0')
tmpb = UCHAR (*++b);
while (tmpa == tmpb && digits[tmpa])
tmpa = UCHAR (*++a), tmpb = UCHAR (*++b);
if ((tmpa == '.' && !digits[tmpb]) || (tmpb == '.' && !digits[tmpa]))
return -fraccompare (a, b);
if (digits[tmpa])
for (loga = 1; digits[UCHAR (*++a)]; ++loga)
;
else
loga = 0;
if (digits[tmpb])
for (logb = 1; digits[UCHAR (*++b)]; ++logb)
;
else
logb = 0;
if ((tmp = logb - loga) != 0)
return tmp;
if (!loga)
return 0;
return tmpb - tmpa;
}
else if (tmpb == '-')
{
if (digits[UCHAR (tmpa)] && digits[UCHAR (*++b)])
return 1;
return 0;
}
else
{
while (tmpa == '0')
tmpa = UCHAR (*++a);
while (tmpb == '0')
tmpb = UCHAR (*++b);
while (tmpa == tmpb && digits[tmpa])
tmpa = UCHAR (*++a), tmpb = UCHAR (*++b);
if ((tmpa == '.' && !digits[tmpb]) || (tmpb == '.' && !digits[tmpa]))
return fraccompare (a, b);
if (digits[tmpa])
for (loga = 1; digits[UCHAR (*++a)]; ++loga)
;
else
loga = 0;
if (digits[tmpb])
for (logb = 1; digits[UCHAR (*++b)]; ++logb)
;
else
logb = 0;
if ((tmp = loga - logb) != 0)
return tmp;
if (!loga)
return 0;
return tmpa - tmpb;
}
}
/* Return an integer <= 12 associated with month name S with length LEN,
0 if the name in S is not recognized. */
static int
getmonth (s, len)
char *s;
int len;
{
char month[4];
register int i, lo = 0, hi = 12;
while (len > 0 && blanks[UCHAR(*s)])
++s, --len;
if (len < 3)
return 0;
for (i = 0; i < 3; ++i)
month[i] = fold_toupper[UCHAR (s[i])];
month[3] = '\0';
while (hi - lo > 1)
if (strcmp (month, monthtab[(lo + hi) / 2].name) < 0)
hi = (lo + hi) / 2;
else
lo = (lo + hi) / 2;
if (!strcmp (month, monthtab[lo].name))
return monthtab[lo].val;
return 0;
}
/* Compare two lines A and B trying every key in sequence until there
are no more keys or a difference is found. */
static int
keycompare (a, b)
struct line *a, *b;
{
register char *texta, *textb, *lima, *limb, *translate;
register int *ignore;
struct keyfield *key;
int diff = 0, iter = 0, lena, lenb;
for (key = keyhead.next; key; key = key->next, ++iter)
{
ignore = key->ignore;
translate = key->translate;
/* Find the beginning and limit of each field. */
if (iter || a->keybeg == NULL || b->keybeg == NULL)
{
if (key->eword >= 0)
lima = limfield (a, key), limb = limfield (b, key);
else
lima = a->text + a->length, limb = b->text + b->length;
if (key->sword >= 0)
texta = begfield (a, key), textb = begfield (b, key);
else
{
texta = a->text, textb = b->text;
if (key->skipsblanks)
{
while (texta < lima && blanks[UCHAR (*texta)])
++texta;
while (textb < limb && blanks[UCHAR (*textb)])
++textb;
}
}
}
else
{
/* For the first iteration only, the key positions have
been precomputed for us. */
texta = a->keybeg, lima = a->keylim;
textb = b->keybeg, limb = b->keylim;
}
/* Find the lengths. */
lena = lima - texta, lenb = limb - textb;
if (lena < 0)
lena = 0;
if (lenb < 0)
lenb = 0;
/* Actually compare the fields. */
if (key->numeric)
{
if (*lima || *limb)
{
char savea = *lima, saveb = *limb;
*lima = *limb = '\0';
diff = numcompare (texta, textb);
*lima = savea, *limb = saveb;
}
else
diff = numcompare (texta, textb);
if (diff)
return key->reverse ? -diff : diff;
continue;
}
else if (key->month)
{
diff = getmonth (texta, lena) - getmonth (textb, lenb);
if (diff)
return key->reverse ? -diff : diff;
continue;
}
else if (ignore && translate)
while (texta < lima && textb < limb)
{
while (texta < lima && ignore[UCHAR (*texta)])
++texta;
while (textb < limb && ignore[UCHAR (*textb)])
++textb;
if (texta < lima && textb < limb &&
translate[UCHAR (*texta++)] != translate[UCHAR (*textb++)])
{
diff = translate[UCHAR (*--texta)] - translate[UCHAR (*--textb)];
break;
}
}
else if (ignore)
while (texta < lima && textb < limb)
{
while (texta < lima && ignore[UCHAR (*texta)])
++texta;
while (textb < limb && ignore[UCHAR (*textb)])
++textb;
if (texta < lima && textb < limb && *texta++ != *textb++)
{
diff = *--texta - *--textb;
break;
}
}
else if (translate)
while (texta < lima && textb < limb)
{
if (translate[UCHAR (*texta++)] != translate[UCHAR (*textb++)])
{
diff = translate[UCHAR (*--texta)] - translate[UCHAR (*--textb)];
break;
}
}
else
diff = memcmp (texta, textb, min (lena, lenb));
if (diff)
return key->reverse ? -diff : diff;
if ((diff = lena - lenb) != 0)
return key->reverse ? -diff : diff;
}
return 0;
}
/* Compare two lines A and B, returning negative, zero, or positive
depending on whether A compares less than, equal to, or greater than B. */
static int
compare (a, b)
register struct line *a, *b;
{
int diff, tmpa, tmpb, mini;
/* First try to compare on the specified keys (if any).
The only two cases with no key at all are unadorned sort,
and unadorned sort -r. */
if (keyhead.next)
{
diff = keycompare (a, b);
if (diff != 0)
return diff;
if (unique || stable)
return 0;
}
/* If the keys all compare equal (or no keys were specified)
fall through to the default byte-by-byte comparison. */
tmpa = a->length, tmpb = b->length;
mini = min (tmpa, tmpb);
if (mini == 0)
diff = tmpa - tmpb;
else
{
char *ap = a->text, *bp = b->text;
diff = UCHAR (*ap) - UCHAR (*bp);
if (diff == 0)
{
diff = memcmp (ap, bp, mini);
if (diff == 0)
diff = tmpa - tmpb;
}
}
return reverse ? -diff : diff;
}
/* Check that the lines read from the given FP come in order. Return
1 if they do and 0 if there is a disorder. */
static int
checkfp (fp)
FILE *fp;
{
struct buffer buf; /* Input buffer. */
struct lines lines; /* Lines scanned from the buffer. */
struct line temp; /* Copy of previous line. */
int cc; /* Character count. */
int cmp; /* Result of calling compare. */
int alloc, i, success = 1;
initbuf (&buf, mergealloc);
initlines (&lines, mergealloc / linelength + 1,
LINEALLOC / ((NMERGE + NMERGE) * sizeof (struct line)));
alloc = linelength;
temp.text = xmalloc (alloc);
cc = fillbuf (&buf, fp);
findlines (&buf, &lines);
if (cc)
do
{
/* Compare each line in the buffer with its successor. */
for (i = 0; i < lines.used - 1; ++i)
{
cmp = compare (&lines.lines[i], &lines.lines[i + 1]);
if ((unique && cmp >= 0) || (cmp > 0))
{
success = 0;
goto finish;
}
}
/* Save the last line of the buffer and refill the buffer. */
if (lines.lines[lines.used - 1].length > alloc)
{
while (lines.lines[lines.used - 1].length + 1 > alloc)
alloc *= 2;
temp.text = xrealloc (temp.text, alloc);
}
bcopy (lines.lines[lines.used - 1].text, temp.text,
lines.lines[lines.used - 1].length + 1);
temp.length = lines.lines[lines.used - 1].length;
cc = fillbuf (&buf, fp);
if (cc)
{
findlines (&buf, &lines);
/* Make sure the line saved from the old buffer contents is
less than or equal to the first line of the new buffer. */
cmp = compare (&temp, &lines.lines[0]);
if ((unique && cmp >= 0) || (cmp > 0))
{
success = 0;
break;
}
}
}
while (cc);
finish:
xfclose (fp);
free (buf.buf);
free ((char *) lines.lines);
free (temp.text);
return success;
}
/* Merge lines from FPS onto OFP. NFPS cannot be greater than NMERGE.
Close FPS before returning. */
static void
mergefps (fps, nfps, ofp)
FILE *fps[], *ofp;
register int nfps;
{
struct buffer buffer[NMERGE]; /* Input buffers for each file. */
struct lines lines[NMERGE]; /* Line tables for each buffer. */
struct line saved; /* Saved line for unique check. */
int savedflag = 0; /* True if there is a saved line. */
int savealloc; /* Size allocated for the saved line. */
int cur[NMERGE]; /* Current line in each line table. */
int ord[NMERGE]; /* Table representing a permutation of fps,
such that lines[ord[0]].lines[cur[ord[0]]]
is the smallest line and will be next
output. */
register int i, j, t;
/* Allocate space for a saved line if necessary. */
if (unique)
{
savealloc = linelength;
saved.text = xmalloc (savealloc);
}
/* Read initial lines from each input file. */
for (i = 0; i < nfps; ++i)
{
initbuf (&buffer[i], mergealloc);
/* If a file is empty, eliminate it from future consideration. */
while (i < nfps && !fillbuf (&buffer[i], fps[i]))
{
xfclose (fps[i]);
--nfps;
for (j = i; j < nfps; ++j)
fps[j] = fps[j + 1];
}
if (i == nfps)
free (buffer[i].buf);
else
{
initlines (&lines[i], mergealloc / linelength + 1,
LINEALLOC / ((NMERGE + NMERGE) * sizeof (struct line)));
findlines (&buffer[i], &lines[i]);
cur[i] = 0;
}
}
/* Set up the ord table according to comparisons among input lines.
Since this only reorders two items if one is strictly greater than
the other, it is stable. */
for (i = 0; i < nfps; ++i)
ord[i] = i;
for (i = 1; i < nfps; ++i)
if (compare (&lines[ord[i - 1]].lines[cur[ord[i - 1]]],
&lines[ord[i]].lines[cur[ord[i]]]) > 0)
t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
/* Repeatedly output the smallest line until no input remains. */
while (nfps)
{
/* If uniqified output is turned out, output only the first of
an identical series of lines. */
if (unique)
{
if (savedflag && compare (&saved, &lines[ord[0]].lines[cur[ord[0]]]))
{
xfwrite (saved.text, 1, saved.length, ofp);
putc ('\n', ofp);
savedflag = 0;
}
if (!savedflag)
{
if (savealloc < lines[ord[0]].lines[cur[ord[0]]].length + 1)
{
while (savealloc < lines[ord[0]].lines[cur[ord[0]]].length + 1)
savealloc *= 2;
saved.text = xrealloc (saved.text, savealloc);
}
saved.length = lines[ord[0]].lines[cur[ord[0]]].length;
bcopy (lines[ord[0]].lines[cur[ord[0]]].text, saved.text,
saved.length + 1);
if (lines[ord[0]].lines[cur[ord[0]]].keybeg != NULL)
{
saved.keybeg = saved.text +
(lines[ord[0]].lines[cur[ord[0]]].keybeg
- lines[ord[0]].lines[cur[ord[0]]].text);
}
if (lines[ord[0]].lines[cur[ord[0]]].keylim != NULL)
{
saved.keylim = saved.text +
(lines[ord[0]].lines[cur[ord[0]]].keylim
- lines[ord[0]].lines[cur[ord[0]]].text);
}
savedflag = 1;
}
}
else
{
xfwrite (lines[ord[0]].lines[cur[ord[0]]].text, 1,
lines[ord[0]].lines[cur[ord[0]]].length, ofp);
putc ('\n', ofp);
}
/* Check if we need to read more lines into core. */
if (++cur[ord[0]] == lines[ord[0]].used)
if (fillbuf (&buffer[ord[0]], fps[ord[0]]))
{
findlines (&buffer[ord[0]], &lines[ord[0]]);
cur[ord[0]] = 0;
}
else
{
/* We reached EOF on fps[ord[0]]. */
for (i = 1; i < nfps; ++i)
if (ord[i] > ord[0])
--ord[i];
--nfps;
xfclose (fps[ord[0]]);
free (buffer[ord[0]].buf);
free ((char *) lines[ord[0]].lines);
for (i = ord[0]; i < nfps; ++i)
{
fps[i] = fps[i + 1];
buffer[i] = buffer[i + 1];
lines[i] = lines[i + 1];
cur[i] = cur[i + 1];
}
for (i = 0; i < nfps; ++i)
ord[i] = ord[i + 1];
continue;
}
/* The new line just read in may be larger than other lines
already in core; push it back in the queue until we encounter
a line larger than it. */
for (i = 1; i < nfps; ++i)
{
t = compare (&lines[ord[0]].lines[cur[ord[0]]],
&lines[ord[i]].lines[cur[ord[i]]]);
if (!t)
t = ord[0] - ord[i];
if (t < 0)
break;
}
t = ord[0];
for (j = 1; j < i; ++j)
ord[j - 1] = ord[j];
ord[i - 1] = t;
}
if (unique && savedflag)
{
xfwrite (saved.text, 1, saved.length, ofp);
putc ('\n', ofp);
free (saved.text);
}
}
/* Sort the array LINES with NLINES members, using TEMP for temporary space. */
static void
sortlines (lines, nlines, temp)
struct line *lines, *temp;
int nlines;
{
register struct line *lo, *hi, *t;
register int nlo, nhi;
if (nlines == 2)
{
if (compare (&lines[0], &lines[1]) > 0)
*temp = lines[0], lines[0] = lines[1], lines[1] = *temp;
return;
}
nlo = nlines / 2;
lo = lines;
nhi = nlines - nlo;
hi = lines + nlo;
if (nlo > 1)
sortlines (lo, nlo, temp);
if (nhi > 1)
sortlines (hi, nhi, temp);
t = temp;
while (nlo && nhi)
if (compare (lo, hi) <= 0)
*t++ = *lo++, --nlo;
else
*t++ = *hi++, --nhi;
while (nlo--)
*t++ = *lo++;
for (lo = lines, nlo = nlines - nhi, t = temp; nlo; --nlo)
*lo++ = *t++;
}
/* Check that each of the NFILES FILES is ordered.
Return a count of disordered files. */
static int
check (files, nfiles)
char *files[];
int nfiles;
{
int i, disorders = 0;
FILE *fp;
for (i = 0; i < nfiles; ++i)
{
fp = xfopen (files[i], "r");
if (!checkfp (fp))
{
printf ("%s: disorder on %s\n", program_name, files[i]);
++disorders;
}
}
return disorders;
}
/* Merge NFILES FILES onto OFP. */
static void
merge (files, nfiles, ofp)
char *files[];
int nfiles;
FILE *ofp;
{
int i, j, t;
char *temp;
FILE *fps[NMERGE], *tfp;
while (nfiles > NMERGE)
{
t = 0;
for (i = 0; i < nfiles / NMERGE; ++i)
{
for (j = 0; j < NMERGE; ++j)
fps[j] = xfopen (files[i * NMERGE + j], "r");
tfp = xfopen (temp = tempname (), "w");
mergefps (fps, NMERGE, tfp);
xfclose (tfp);
for (j = 0; j < NMERGE; ++j)
zaptemp (files[i * NMERGE + j]);
files[t++] = temp;
}
for (j = 0; j < nfiles % NMERGE; ++j)
fps[j] = xfopen (files[i * NMERGE + j], "r");
tfp = xfopen (temp = tempname (), "w");
mergefps (fps, nfiles % NMERGE, tfp);
xfclose (tfp);
for (j = 0; j < nfiles % NMERGE; ++j)
zaptemp (files[i * NMERGE + j]);
files[t++] = temp;
nfiles = t;
}
for (i = 0; i < nfiles; ++i)
fps[i] = xfopen (files[i], "r");
mergefps (fps, i, ofp);
for (i = 0; i < nfiles; ++i)
zaptemp (files[i]);
}
/* Sort NFILES FILES onto OFP. */
static void
sort (files, nfiles, ofp)
char **files;
int nfiles;
FILE *ofp;
{
struct buffer buf;
struct lines lines;
struct line *tmp;
int i, ntmp;
FILE *fp, *tfp;
struct tempnode *node;
int ntemp = 0;
char **tempfiles;
initbuf (&buf, sortalloc);
initlines (&lines, sortalloc / linelength + 1,
LINEALLOC / sizeof (struct line));
ntmp = lines.alloc;
tmp = (struct line *) xmalloc (ntmp * sizeof (struct line));
while (nfiles--)
{
fp = xfopen (*files++, "r");
while (fillbuf (&buf, fp))
{
findlines (&buf, &lines);
if (lines.used > ntmp)
{
while (lines.used > ntmp)
ntmp *= 2;
tmp = (struct line *)
xrealloc ((char *) tmp, ntmp * sizeof (struct line));
}
sortlines (lines.lines, lines.used, tmp);
if (feof (fp) && !nfiles && !ntemp && !buf.left)
tfp = ofp;
else
{
++ntemp;
tfp = xfopen (tempname (), "w");
}
for (i = 0; i < lines.used; ++i)
if (!unique || i == 0
|| compare (&lines.lines[i], &lines.lines[i - 1]))
{
xfwrite (lines.lines[i].text, 1, lines.lines[i].length, tfp);
putc ('\n', tfp);
}
if (tfp != ofp)
xfclose (tfp);
}
xfclose (fp);
}
free (buf.buf);
free ((char *) lines.lines);
free ((char *) tmp);
if (ntemp)
{
tempfiles = (char **) xmalloc (ntemp * sizeof (char *));
i = ntemp;
for (node = temphead.next; i > 0; node = node->next)
tempfiles[--i] = node->name;
merge (tempfiles, ntemp, ofp);
free ((char *) tempfiles);
}
}
/* Insert key KEY at the end of the list (`keyhead'). */
static void
insertkey (key)
struct keyfield *key;
{
struct keyfield *k = &keyhead;
while (k->next)
k = k->next;
k->next = key;
key->next = NULL;
}
static void
badfieldspec (s)
char *s;
{
error (2, 0, "invalid field specification `%s'", s);
}
/* Handle interrupts and hangups. */
static void
sighandler (sig)
int sig;
{
#ifdef _POSIX_VERSION
struct sigaction sigact;
sigact.sa_handler = SIG_DFL;
sigemptyset (&sigact.sa_mask);
sigact.sa_flags = 0;
sigaction (sig, &sigact, NULL);
#else /* !_POSIX_VERSION */
signal (sig, SIG_DFL);
#endif /* _POSIX_VERSION */
cleanup ();
kill (getpid (), sig);
}
/* Set the ordering options for KEY specified in S.
Return the address of the first character in S that
is not a valid ordering option.
BLANKTYPE is the kind of blanks that 'b' should skip. */
static char *
set_ordering (s, key, blanktype)
register char *s;
struct keyfield *key;
enum blanktype blanktype;
{
while (*s)
{
switch (*s)
{
case 'b':
if (blanktype == bl_start || blanktype == bl_both)
key->skipsblanks = 1;
if (blanktype == bl_end || blanktype == bl_both)
key->skipeblanks = 1;
break;
case 'd':
key->ignore = nondictionary;
break;
case 'f':
key->translate = fold_toupper;
break;
#if 0
case 'g':
/* Reserved for comparing floating-point numbers. */
break;
#endif
case 'i':
key->ignore = nonprinting;
break;
case 'M':
key->month = 1;
break;
case 'n':
key->numeric = 1;
break;
case 'r':
key->reverse = 1;
break;
default:
return s;
}
++s;
}
return s;
}
int
main (argc, argv)
int argc;
char *argv[];
{
struct keyfield *key = NULL, gkey;
char *s;
int i, t, t2;
int checkonly = 0, mergeonly = 0, nfiles = 0;
char *minus = "-", *outfile = minus, **files, *tmp;
FILE *ofp;
#ifdef _POSIX_VERSION
struct sigaction oldact, newact;
#endif /* _POSIX_VERSION */
program_name = argv[0];
#if 0
parse_long_options (argc, argv, usage);
#endif
have_read_stdin = 0;
inittables ();
temp_file_prefix = getenv ("TMPDIR");
if (temp_file_prefix == NULL)
temp_file_prefix = "/tmp";
#ifdef _POSIX_VERSION
newact.sa_handler = sighandler;
sigemptyset (&newact.sa_mask);
newact.sa_flags = 0;
sigaction (SIGINT, NULL, &oldact);
if (oldact.sa_handler != SIG_IGN)
sigaction (SIGINT, &newact, NULL);
sigaction (SIGHUP, NULL, &oldact);
if (oldact.sa_handler != SIG_IGN)
sigaction (SIGHUP, &newact, NULL);
sigaction (SIGPIPE, NULL, &oldact);
if (oldact.sa_handler != SIG_IGN)
sigaction (SIGPIPE, &newact, NULL);
sigaction (SIGTERM, NULL, &oldact);
if (oldact.sa_handler != SIG_IGN)
sigaction (SIGTERM, &newact, NULL);
#else /* !_POSIX_VERSION */
if (signal (SIGINT, SIG_IGN) != SIG_IGN)
signal (SIGINT, sighandler);
if (signal (SIGHUP, SIG_IGN) != SIG_IGN)
signal (SIGHUP, sighandler);
if (signal (SIGPIPE, SIG_IGN) != SIG_IGN)
signal (SIGPIPE, sighandler);
if (signal (SIGTERM, SIG_IGN) != SIG_IGN)
signal (SIGTERM, sighandler);
#endif /* !_POSIX_VERSION */
gkey.sword = gkey.eword = -1;
gkey.ignore = NULL;
gkey.translate = NULL;
gkey.numeric = gkey.month = gkey.reverse = 0;
gkey.skipsblanks = gkey.skipeblanks = 0;
files = (char **) xmalloc (sizeof (char *) * argc);
for (i = 1; i < argc; ++i)
{
if (argv[i][0] == '+')
{
if (key)
insertkey (key);
key = (struct keyfield *) xmalloc (sizeof (struct keyfield));
key->eword = -1;
key->ignore = NULL;
key->translate = NULL;
key->skipsblanks = key->skipeblanks = 0;
key->numeric = key->month = key->reverse = 0;
s = argv[i] + 1;
if (!digits[UCHAR (*s)])
badfieldspec (argv[i]);
for (t = 0; digits[UCHAR (*s)]; ++s)
t = 10 * t + *s - '0';
t2 = 0;
if (*s == '.')
for (++s; digits[UCHAR (*s)]; ++s)
t2 = 10 * t2 + *s - '0';
if (t2 || t)
{
key->sword = t;
key->schar = t2;
}
else
key->sword = -1;
s = set_ordering (s, key, bl_start);
if (*s)
badfieldspec (argv[i]);
}
else if (argv[i][0] == '-' && argv[i][1])
{
s = argv[i] + 1;
if (digits[UCHAR (*s)])
{
if (!key)
usage (2);
for (t = 0; digits[UCHAR (*s)]; ++s)
t = t * 10 + *s - '0';
t2 = 0;
if (*s == '.')
for (++s; digits[UCHAR (*s)]; ++s)
t2 = t2 * 10 + *s - '0';
key->eword = t;
key->echar = t2;
s = set_ordering (s, key, bl_end);
if (*s)
badfieldspec (argv[i]);
insertkey (key);
key = NULL;
}
else
while (*s)
{
s = set_ordering (s, &gkey, bl_both);
switch (*s)
{
case '\0':
break;
case 'c':
checkonly = 1;
break;
case 'k':
if (s[1])
++s;
else
{
if (i == argc - 1)
error (2, 0, "option `-k' requires an argument");
else
s = argv[++i];
}
if (key)
insertkey (key);
key = (struct keyfield *)
xmalloc (sizeof (struct keyfield));
key->eword = -1;
key->ignore = NULL;
key->translate = NULL;
key->skipsblanks = key->skipeblanks = 0;
key->numeric = key->month = key->reverse = 0;
/* Get POS1. */
if (!digits[UCHAR (*s)])
badfieldspec (argv[i]);
for (t = 0; digits[UCHAR (*s)]; ++s)
t = 10 * t + *s - '0';
if (t)
t--;
t2 = 0;
if (*s == '.')
{
for (++s; digits[UCHAR (*s)]; ++s)
t2 = 10 * t2 + *s - '0';
if (t2)
t2--;
}
if (t2 || t)
{
key->sword = t;
key->schar = t2;
}
else
key->sword = -1;
s = set_ordering (s, key, bl_start);
if (*s && *s != ',')
badfieldspec (argv[i]);
else if (*s++)
{
/* Get POS2. */
for (t = 0; digits[UCHAR (*s)]; ++s)
t = t * 10 + *s - '0';
t2 = 0;
if (*s == '.')
{
for (++s; digits[UCHAR (*s)]; ++s)
t2 = t2 * 10 + *s - '0';
if (t2)
t--;
}
key->eword = t;
key->echar = t2;
s = set_ordering (s, key, bl_end);
if (*s)
badfieldspec (argv[i]);
}
insertkey (key);
key = NULL;
goto outer;
case 'm':
mergeonly = 1;
break;
case 'o':
if (s[1])
outfile = s + 1;
else
{
if (i == argc - 1)
error (2, 0, "option `-o' requires an argument");
else
outfile = argv[++i];
}
goto outer;
case 's':
stable = 1;
break;
case 't':
if (s[1])
tab = *++s;
else if (i < argc - 1)
{
tab = *argv[++i];
goto outer;
}
else
error (2, 0, "option `-t' requires an argument");
break;
case 'T':
if (s[1])
temp_file_prefix = ++s;
else if (i < argc - 1)
{
temp_file_prefix = argv[++i];
goto outer;
}
else
error (2, 0, "option `-T' requires an argument");
break;
case 'u':
unique = 1;
break;
case 'y':
/* Accept and ignore e.g. -y0 for compatibility with
Solaris 2. */
goto outer;
default:
fprintf (stderr, "%s: unrecognized option `-%c'\n",
argv[0], *s);
usage (2);
}
if (*s)
++s;
}
}
else /* Not an option. */
{
files[nfiles++] = argv[i];
}
outer:;
}
if (key)
insertkey (key);
/* Inheritance of global options to individual keys. */
for (key = keyhead.next; key; key = key->next)
if (!key->ignore && !key->translate && !key->skipsblanks && !key->reverse
&& !key->skipeblanks && !key->month && !key->numeric)
{
key->ignore = gkey.ignore;
key->translate = gkey.translate;
key->skipsblanks = gkey.skipsblanks;
key->skipeblanks = gkey.skipeblanks;
key->month = gkey.month;
key->numeric = gkey.numeric;
key->reverse = gkey.reverse;
}
if (!keyhead.next && (gkey.ignore || gkey.translate || gkey.skipsblanks
|| gkey.skipeblanks || gkey.month || gkey.numeric))
insertkey (&gkey);
reverse = gkey.reverse;
if (nfiles == 0)
{
nfiles = 1;
files = &minus;
}
if (checkonly)
exit (check (files, nfiles) != 0);
if (strcmp (outfile, "-"))
{
for (i = 0; i < nfiles; ++i)
if (!strcmp (outfile, files[i]))
break;
if (i == nfiles)
ofp = xfopen (outfile, "w");
else
{
char buf[8192];
FILE *fp = xfopen (outfile, "r");
int cc;
tmp = tempname ();
ofp = xfopen (tmp, "w");
while ((cc = fread (buf, 1, sizeof buf, fp)) > 0)
xfwrite (buf, 1, cc, ofp);
if (ferror (fp))
{
error (0, errno, "%s", outfile);
cleanup ();
exit (2);
}
xfclose (ofp);
xfclose (fp);
files[i] = tmp;
ofp = xfopen (outfile, "w");
}
}
else
ofp = stdout;
if (mergeonly)
merge (files, nfiles, ofp);
else
sort (files, nfiles, ofp);
cleanup ();
/* If we wait for the implicit flush on exit, and the parent process
has closed stdout (e.g., exec >&- in a shell), then the output file
winds up empty. I don't understand why. This is under SunOS,
Solaris, Ultrix, and Irix. This premature fflush makes the output
reappear. --karl@cs.umb.edu */
if (fflush (ofp) < 0)
error (1, errno, "fflush", outfile);
if (have_read_stdin && fclose (stdin) == EOF)
error (1, errno, "-");
if (ferror (stdout) || fclose (stdout) == EOF)
error (1, errno, "write error");
exit (0);
}
#ifdef BUILTIN_HELP
static void
usage (status)
int status;
{
if (status != 0)
fprintf (stderr, "Try `%s --help' for more information.\n",
program_name);
else
{
printf ("\
Usage: %s [OPTION]... [FILE]...\n\
",
program_name);
printf ("\
\n\
+POS1 [-POS2] start a key at POS1, end it before POS2\n\
-M compare (unknown) < `JAN' < ... < `DEC', imply -b\n\
-T DIRECT use DIRECTfor temporary files, not $TEMPDIR nor /tmp\n\
-b ignore leading blanks in sort fields or keys\n\
-c check if given files already sorted, do not sort\n\
-d consider only [a-zA-Z0-9 ] characters in keys\n\
-f fold lower case to upper case characters in keys\n\
-i consider only [\\040-\\0176] characters in keys\n\
-k POS1[,POS2] same as +POS1 [-POS2], but all positions counted from 1\n\
-m merge already sorted files, do not sort\n\
-n compare according to string numerical value, imply -b\n\
-o FILE write result on FILE instead of standard output\n\
-r reverse the result of comparisons\n\
-s stabilize sort by disabling last resort comparison\n\
-t SEP use SEParator instead of non- to whitespace transition\n\
-u with -c, check for strict ordering\n\
-u with -m, only output the first of an equal sequence\n\
--help display this help and exit\n\
--version output version information and exit\n\
\n\
POS is F[.C][OPTS], where F is the field number and C the character\n\
position in the field, both counted from zero. OPTS is made up of one\n\
or more of Mbdfinr, this effectively disable global -Mbdfinr settings\n\
for that key. If no key given, use the entire line as key. With no\n\
FILE, or when FILE is -, read standard input.\n\
");
}
exit (status);
}
#else /* use the man page */
static void
usage (status)
int status;
{
fprintf (stderr, "\
Usage: %s [-cmus] [-t separator] [-o output-file] [-bdfiMnr] [+POS1 [-POS2]]\n\
[-k POS1[,POS2]] [file...]\n",
program_name);
exit (status);
}
#endif
error (n, e, s, s1)
int n, e;
char *s, *s1;
{
if (e)
fprintf(stderr,"error %d:", e);
fprintf(stderr, s, s1);
if (n)
exit(n);
}