NetBSD/usr.bin/sort/msort.c
joerg 2b8a053617 Retire __SCCSID. It has only archeological value now. Also retire lint
conditional around __RCSID, lint can handle that fine.
2009-11-06 18:34:22 +00:00

428 lines
12 KiB
C

/* $NetBSD: msort.c,v 1.29 2009/11/06 18:34:22 joerg Exp $ */
/*-
* Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Ben Harris and Jaromir Dolecek.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Peter McIlroy.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "sort.h"
#include "fsort.h"
__RCSID("$NetBSD: msort.c,v 1.29 2009/11/06 18:34:22 joerg Exp $");
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>
/* Subroutines using comparisons: merge sort and check order */
/*
 * Value passed as the 'delete' argument of insert() when the record being
 * inserted replaces the entry at flist[0]; callers pass !DELETE for a plain
 * insertion that lengthens the list.
 */
#define DELETE (1)
/*
 * One input of a merge: the stream, the routine used to read records from
 * it, and the buffer holding the record most recently read from it.
 */
typedef struct mfile {
/* Stream the records are read from. */
FILE *fp;
/* Routine called as get(fp, rec, end, ftbl) to read the next record. */
get_func_t get;
/* Current record buffer; merge_sort_fstack() allocates it while NULL. */
RECHEADER *rec;
/* End of the buffer that 'rec' points to (rec plus its allocated size). */
u_char *end;
} MFILE;
/* Forward declarations of the file-local helpers defined further down. */
static int cmp(RECHEADER *, RECHEADER *);
static int insert(struct mfile **, struct mfile *, int, int);
static void merge_sort_fstack(FILE *, put_func_t, struct field *);
/*
 * Number of files merge_sort_fstack() can merge in one pass.
 */
#define MERGE_FNUM 16
/*
 * Inputs waiting to be merged, kept in three levels of MERGE_FNUM slots:
 *  - fstack holds the files handed to save_for_merge(), and is the array
 *    merge_sort_fstack() actually merges;
 *  - fstack_1 holds temporary files, each the merged output of a full fstack;
 *  - fstack_2 holds temporary files, each the merged output of a full
 *    fstack_1.
 * The *_count variables give the number of slots in use at each level.
 */
static struct mfile fstack[MERGE_FNUM];
static struct mfile fstack_1[MERGE_FNUM];
static struct mfile fstack_2[MERGE_FNUM];
static int fstack_count, fstack_1_count, fstack_2_count;
void
save_for_merge(FILE *fp, get_func_t get, struct field *ftbl)
{
FILE *mfp, *mfp1, *mfp2;
if (fstack_count == MERGE_FNUM) {
/* Must reduce the number of temporary files */
mfp = ftmp();
merge_sort_fstack(mfp, putrec, ftbl);
/* Save output in next layer */
if (fstack_1_count == MERGE_FNUM) {
mfp1 = ftmp();
memcpy(fstack, fstack_1, sizeof fstack);
merge_sort_fstack(mfp1, putrec, ftbl);
if (fstack_2_count == MERGE_FNUM) {
/* More than 4096 files! */
mfp2 = ftmp();
memcpy(fstack, fstack_2, sizeof fstack);
merge_sort_fstack(mfp2, putrec, ftbl);
fstack_2[0].fp = mfp2;
fstack_2_count = 1;
}
fstack_2[fstack_2_count].fp = mfp1;
fstack_2[fstack_2_count].get = geteasy;
fstack_2_count++;
fstack_1_count = 0;
}
fstack_1[fstack_1_count].fp = mfp;
fstack_1[fstack_1_count].get = geteasy;
fstack_1_count++;
fstack_count = 0;
}
fstack[fstack_count].fp = fp;
fstack[fstack_count++].get = get;
}
/*
 * Open each of the first 'nfiles' names in 'filelist' for reading, queue
 * the stream with save_for_merge(), then write the merged result to 'outfp'
 * through merge_sort().  A file that cannot be opened is fatal (exit 2).
 */
void
fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
{
	get_func_t reader;
	const char *path;
	FILE *in;
	int n;

	/* Whole-line keys use makeline, field keys use makekey */
	if (SINGL_FLD)
		reader = makeline;
	else
		reader = makekey;

	n = 0;
	while (n < nfiles) {
		path = filelist->names[n++];
		in = fopen(path, "r");
		if (in == NULL)
			err(2, "%s", path);
		save_for_merge(in, reader, ftbl);
	}

	merge_sort(outfp, putline, ftbl);
}
void
merge_sort(FILE *outfp, put_func_t put, struct field *ftbl)
{
int count = fstack_1_count + fstack_2_count;
FILE *mfp;
int i;
if (count == 0) {
/* All files in initial array */
merge_sort_fstack(outfp, put, ftbl);
return;
}
count += fstack_count;
/* Too many files for one merge sort */
for (;;) {
/* Sort latest 16 files */
i = count;
if (i > MERGE_FNUM)
i = MERGE_FNUM;
while (fstack_count > 0)
fstack[--i] = fstack[--fstack_count];
while (i > 0 && fstack_1_count > 0)
fstack[--i] = fstack_1[--fstack_1_count];
while (i > 0)
fstack[--i] = fstack_2[--fstack_2_count];
if (count <= MERGE_FNUM) {
/* Got all the data */
fstack_count = count;
merge_sort_fstack(outfp, put, ftbl);
return;
}
mfp = ftmp();
fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count;
merge_sort_fstack(mfp, putrec, ftbl);
fstack[0].fp = mfp;
fstack[0].get = geteasy;
fstack_count = 1;
count -= MERGE_FNUM - 1;
}
}
static void
merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl)
{
struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
RECHEADER *new_rec;
u_char *new_end;
void *tmp;
int c, i, nfiles;
size_t sz;
/* Read one record from each file (read again if a duplicate) */
for (nfiles = i = 0; i < fstack_count; i++) {
cfile = &fstack[i];
if (cfile->rec == NULL) {
cfile->rec = emalloc(DEFLLEN);
cfile->end = (u_char *)cfile->rec + DEFLLEN;
}
rewind(cfile->fp);
for (;;) {
c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl);
if (c == EOF)
break;
if (c == BUFFEND) {
/* Double buffer size */
sz = (cfile->end - (u_char *)cfile->rec) * 2;
cfile->rec = erealloc(cfile->rec, sz);
cfile->end = (u_char *)cfile->rec + sz;
continue;
}
if (nfiles != 0) {
if (insert(flist, cfile, nfiles, !DELETE))
/* Duplicate removed */
continue;
} else
flist[0] = cfile;
nfiles++;
break;
}
}
if (nfiles == 0)
return;
/*
* We now loop reading a new record from the file with the
* 'sorted first' existing record.
* As each record is added, the 'first' record is written to the
* output file - maintaining one record from each file in the sorted
* list.
*/
new_rec = emalloc(DEFLLEN);
new_end = (u_char *)new_rec + DEFLLEN;
for (;;) {
cfile = flist[0];
c = cfile->get(cfile->fp, new_rec, new_end, ftbl);
if (c == EOF) {
/* Write out last record from now-empty input */
put(cfile->rec, outfp);
if (--nfiles == 0)
break;
/* Replace from file with now-first sorted record. */
/* (Moving base 'flist' saves copying everything!) */
flist++;
continue;
}
if (c == BUFFEND) {
/* Buffer not large enough - double in size */
sz = (new_end - (u_char *)new_rec) * 2;
new_rec = erealloc(new_rec, sz);
new_end = (u_char *)new_rec +sz;
continue;
}
/* Swap in new buffer, saving old */
tmp = cfile->rec;
cfile->rec = new_rec;
new_rec = tmp;
tmp = cfile->end;
cfile->end = new_end;
new_end = tmp;
/* Add into sort, removing the original first entry */
c = insert(flist, cfile, nfiles, DELETE);
if (c != 0 || (UNIQUE && cfile == flist[0]
&& cmp(new_rec, cfile->rec) == 0)) {
/* Was an unwanted duplicate, restore buffer */
tmp = cfile->rec;
cfile->rec = new_rec;
new_rec = tmp;
tmp = cfile->end;
cfile->end = new_end;
new_end = tmp;
continue;
}
/* Write out 'old' record */
put(new_rec, outfp);
}
free(new_rec);
}
/*
 * Place 'rec' in the list flist[0..ttop-1], which is kept sorted by the
 * record each entry currently holds, using a binary search with cmp().
 *
 * if delete: rec is the entry at flist[0] (now holding a new record); the
 * entries that sort before it are moved down one slot and rec is stored
 * after them, so the list keeps its length.
 * otherwise: rec is a new entry; it is inserted at its sorted position and
 * the entries after it move up one slot, so flist must have room for
 * ttop + 1 entries.
 *
 * Returns 1 if record is a duplicate to be ignored (only when UNIQUE is
 * set), in which case flist is left untouched; returns 0 otherwise.
 */
static int
insert(struct mfile **flist, struct mfile *rec, int ttop, int delete)
{
/* Search state: the insertion point lies after flist[bot], before flist[top] */
int mid, top = ttop, bot = 0, cmpv = 1;
/* mid is always at least 1 here, so flist[0] is never examined */
for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
cmpv = cmp(rec->rec, flist[mid]->rec);
if (cmpv == 0 ) {
if (UNIQUE)
/* Duplicate key, read another record */
/* NB: This doesn't guarantee to keep any
 * particular record. */
return 1;
/*
 * Apply sort by input file order.
 * We could truncate the sort if the fileno are
 * adjacent - but that is all too hard!
 * The fileno cannot be equal, since we only have one
 * record from each file (+ flist[0] which never
 * comes here).
 */
/* Order equal keys by the entries' addresses */
cmpv = rec < flist[mid] ? -1 : 1;
if (REVERSE)
cmpv = -cmpv;
}
if (cmpv < 0)
top = mid;
else
bot = mid;
}
/* At this point we haven't yet compared against flist[0] */
if (delete) {
/* flist[0] is ourselves, only the caller knows the old data */
if (bot != 0) {
/* Close the gap left at flist[0] and store rec at its new slot */
memmove(flist, flist + 1, bot * sizeof(MFILE *));
flist[bot] = rec;
}
return 0;
}
/* Inserting original set of records */
if (bot == 0 && cmpv != 0) {
/* Doesn't match flist[1], must compare with flist[0] */
cmpv = cmp(rec->rec, flist[0]->rec);
if (cmpv == 0 && UNIQUE)
return 1;
/* Add matching keys in file order (ie new is later) */
if (cmpv < 0)
bot = -1;
}
/* rec goes right after flist[bot]; open a slot for it there */
bot++;
memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *));
flist[bot] = rec;
return 0;
}
/*
* check order on one file
*/
void
order(struct filelist *filelist, struct field *ftbl)
{
get_func_t get = SINGL_FLD ? makeline : makekey;
RECHEADER *crec, *prec, *trec;
u_char *crec_end, *prec_end, *trec_end;
FILE *fp;
int c;
fp = fopen(filelist->names[0], "r");
if (fp == NULL)
err(2, "%s", filelist->names[0]);
crec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
crec_end = crec->data + DEFLLEN;
prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
prec_end = prec->data + DEFLLEN;
/* XXX this does exit(0) for overlong lines */
if (get(fp, prec, prec_end, ftbl) != 0)
exit(0);
while (get(fp, crec, crec_end, ftbl) == 0) {
if (0 < (c = cmp(prec, crec))) {
crec->data[crec->length-1] = 0;
errx(1, "found disorder: %s", crec->data+crec->offset);
}
if (UNIQUE && !c) {
crec->data[crec->length-1] = 0;
errx(1, "found non-uniqueness: %s",
crec->data+crec->offset);
}
/*
* Swap pointers so that this record is on place pointed
* to by prec and new record is read to place pointed to by
* crec.
*/
trec = prec;
prec = crec;
crec = trec;
trec_end = prec_end;
prec_end = crec_end;
crec_end = trec_end;
}
exit(0);
}
/*
 * Compare the keys of two records.
 *
 * The keys are compared bytewise over the length of the shorter one; if
 * that prefix is equal the shorter key sorts first.  The sign of the result
 * is flipped when REVERSE is set.  Returns a negative, zero or positive
 * value in the manner of memcmp().
 */
static int
cmp(RECHEADER *rec1, RECHEADER *rec2)
{
	int shorter, diff;

	/* key is weights */
	shorter = min(rec1->keylen, rec2->keylen);
	diff = memcmp(rec1->data, rec2->data, shorter);
	if (diff == 0)
		diff = rec1->keylen - rec2->keylen;

	return REVERSE ? -diff : diff;
}