NetBSD/usr.bin/spell/spellprog/spellprog.c

836 lines
19 KiB
C

/* $NetBSD: spellprog.c,v 1.5 2006/08/26 18:17:43 christos Exp $ */
/* derived from OpenBSD: spellprog.c,v 1.4 2003/06/03 02:56:16 millert Exp */
/*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)spell.h 8.1 (Berkeley) 6/6/93
*/
/*
* Copyright (C) Caldera International Inc. 2001-2002.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code and documentation must retain the above
* copyright notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed or owned by Caldera
* International, Inc.
* 4. Neither the name of Caldera International, Inc. nor the names of other
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
* INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
* INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef lint
static const char copyright[] =
"@(#) Copyright (c) 1991, 1993\n\
The Regents of the University of California. All rights reserved.\n";
#endif /* not lint */
#ifndef lint
#if 0
static const char sccsid[] = "@(#)spell.c 8.1 (Berkeley) 6/6/93";
#else
#endif
static const char rcsid[] = "$OpenBSD: spellprog.c,v 1.4 2003/06/03 02:56:16 millert Exp $";
#endif /* not lint */
#include <sys/param.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>
#include "extern.h"
/* Amount suffix() adds to the derivation level each time it is entered. */
#define DLEV 2
/* Dictionary lookup, prefix/suffix stripping and assorted helpers. */
static int dict(char *, char *);
static int trypref(char *, const char *, size_t);
static int tryword(char *, char *, size_t);
static int suffix(char *, size_t);
static int vowel(int);
static const char *lookuppref(char **, char *);
static char *skipv(char *);
static void ise(void);
static void print_word(FILE *);
static void ztos(char *);
static int monosyl(char *, char *);
static void usage(void) __attribute__((__noreturn__));
static void getderiv(size_t);
/*
* Suffix rule handlers referenced from suftab[]. suffix() calls them
* with a pointer into the word, the table's two derivation strings and
* the current derivation level; a non-zero return means the word was
* accepted.
*/
static int an(char *, const char *, const char *, size_t);
static int bility(char *, const char *, const char *, size_t);
static int es(char *, const char *, const char *, size_t);
static int i_to_y(char *, const char *, const char *, size_t);
static int ily(char *, const char *, const char *, size_t);
static int ize(char *, const char *, const char *, size_t);
static int metry(char *, const char *, const char *, size_t);
static int ncy(char *, const char *, const char *, size_t);
static int nop(char *, const char *, const char *, size_t);
static int s(char *, const char *, const char *, size_t);
static int strip(char *, const char *, const char *, size_t);
static int tion(char *, const char *, const char *, size_t);
static int y_to_e(char *, const char *, const char *, size_t);
static int CCe(char *, const char *, const char *, size_t);
static int VCe(char *, const char *, const char *, size_t);
/*
 * Suffix rules, consulted in table order by suffix(); the first entry
 * whose suffix matches the end of the word decides the outcome, so
 * longer suffixes must precede their shorter tails.
 *
 * This cannot be const because we modify it when we choose british
 * spelling (see ise()).
 */
static struct suftab {
	/* Suffix spelled backwards; suffix() compares it from the word's end. */
	const char *suf;
	/* First rule: handler, and how many trailing characters to drop. */
	int (*p1)(char *, const char *, const char *, size_t);
	int n1;
	/* Derivation notes handed to p1 as its `d' and `a' arguments. */
	const char *d1;
	const char *a1;
	/* Optional second rule, tried only when the first one fails. */
	int (*p2)(char *, const char *, const char *, size_t);
	int n2;
	const char *d2;
	const char *a2;
} suftab[] = {
	{"ssen", ily, 4, "-y+iness", "+ness" },
	{"ssel", ily, 4, "-y+i+less", "+less" },
	{"se", s, 1, "", "+s", es, 2, "-y+ies", "+es" },
	{"s'", s, 2, "", "+'s"},
	{"s", s, 1, "", "+s"},
	{"ecn", ncy, 1, "", "-t+ce"},
	{"ycn", ncy, 1, "", "-cy+t"},
	{"ytilb", nop, 0, "", ""},
	{"ytilib", bility, 5, "-le+ility", ""},
	{"elbaif", i_to_y, 4, "-y+iable", ""},
	{"elba", CCe, 4, "-e+able", "+able"},
	{"yti", CCe, 3, "-e+ity", "+ity"},
	{"ylb", y_to_e, 1, "-e+y", ""},
	{"yl", ily, 2, "-y+ily", "+ly"},
	{"laci", strip, 2, "", "+al"},
	{"latnem", strip, 2, "", "+al"},
	{"lanoi", strip, 2, "", "+al"},
	{"tnem", strip, 4, "", "+ment"},
	{"gni", CCe, 3, "-e+ing", "+ing"},
	{"reta", nop, 0, "", ""},
	{"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"},
	{"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"},
	{"citsi", strip, 2, "", "+ic"},
	{"cihparg", i_to_y, 1, "-y+ic", ""},
	{"tse", strip, 2, "", "+st", i_to_y, 3, "-y+iest", "+est"},
	{"cirtem", i_to_y, 1, "-y+ic", ""},
	{"yrtem", metry, 0, "-ry+er", ""},
	{"cigol", i_to_y, 1, "-y+ic", ""},
	{"tsigol", i_to_y, 2, "-y+ist", ""},
	{"tsi", VCe, 3, "-e+ist", "+ist"},
	/* The plain-strip note for "-ism" is "+ism", not "+ist". */
	{"msi", VCe, 3, "-e+ism", "+ism"},
	{"noitacif", i_to_y, 6, "-y+ication", ""},
	{"noitazi", ize, 5, "-e+ation", ""},
	{"rota", tion, 2, "-e+or", ""},
	{"noit", tion, 3, "-e+ion", "+ion"},
	{"naino", an, 3, "", "+ian"},
	{"na", an, 1, "", "+n"},
	{"evit", tion, 3, "-e+ive", "+ive"},
	{"ezi", CCe, 3, "-e+ize", "+ize"},
	{"pihs", strip, 4, "", "+ship"},
	{"dooh", ily, 4, "-y+hood", "+hood"},
	{"ekil", strip, 4, "", "+like"},
	{ NULL, }	/* terminator: suf == NULL ends the scan */
};
/*
* Prefixes that lookuppref() tries to strip from the front of a word.
* The table is scanned in order and the first usable match wins, so a
* prefix must come before any shorter prefix it starts with.
* The list is terminated by a NULL entry.
*/
static const char *preftab[] = {
"anti",
"bio",
"dis",
"electro",
"en",
"fore",
"hyper",
"intra",
"inter",
"iso",
"kilo",
"magneto",
"meta",
"micro",
"milli",
"mis",
"mono",
"multi",
"non",
"out",
"over",
"photo",
"poly",
"pre",
"pseudo",
"re",
"semi",
"stereo",
"sub",
"super",
"thermo",
"ultra",
"under", /* must precede un */
"un",
NULL
};
/*
* One entry per word list named on the command line. main() maps each
* file read-only and terminates the array with an entry whose fd is -1.
*/
static struct wlist {
/* Descriptor the list was opened on; -1 marks the end of the array. */
int fd;
/* First byte of the mapping. */
unsigned char *front;
/* One past the last byte of the mapping (front + file size). */
unsigned char *back;
} *wlists;
/* Set by -v: also write derivations to the "found" file. */
static int vflag;
/* Set by -x: dict() prints every candidate stem to stdout. */
static int xflag;
/* Word being examined; rewritten in place while affixes are tried. */
static char word[LINE_MAX];
/* Copy of the word as it was read, used for output. */
static char original[LINE_MAX];
/* Derivation text collected by tryword() when -v is in effect. */
static char affix[LINE_MAX];
/*
* Table of derivation strings indexed by derivation level; grown on
* demand by getderiv().
*/
static struct {
const char **buf;
size_t maxlev;
} deriv;
/*
* The spellprog utility accepts a newline-delimited list of words
* on stdin. For arguments it expects the path to a word list and
* the path to a file in which to store found words.
*
* In normal usage, spell is called twice. The first time it is
* called with a stop list to flag commonly misspelled words. The
* remaining words are then passed to spell again, this time with
* the dictionary file as the first (non-flag) argument.
*
* Unlike historic versions of spellprog, this one does not use
* hashed files. Instead it simply requires that files be sorted
* lexicographically and uses the same algorithm as the look utility.
*
* Note that spellprog should be called via the spell shell script
* and is not meant to be invoked directly by the user.
*/
int
main(int argc, char **argv)
{
	char *ep, *cp, *dp;
	char *outfile;
	int ch, fold, i;
	size_t len;
	struct stat sb;
	FILE *file, *found;

	setlocale(LC_ALL, "");

	outfile = NULL;
	while ((ch = getopt(argc, argv, "bvxo:")) != -1) {
		switch (ch) {
		case 'b':
			/* Use British dictionary and convert ize -> ise. */
			ise();
			break;
		case 'o':
			outfile = optarg;
			break;
		case 'v':
			/* Also write derivations to "found" file. */
			vflag++;
			break;
		case 'x':
			/* Print plausible stems to stdout. */
			xflag++;
			break;
		default:
			usage();
		}
	}
	argc -= optind;
	argv += optind;
	if (argc < 1)
		usage();

	/*
	 * Open and mmap the word/stop lists.  One extra slot holds the
	 * fd == -1 terminator that dict() looks for.
	 */
	wlists = malloc(sizeof(*wlists) * ((size_t)argc + 1));
	if (wlists == NULL)
		err(1, "malloc");
	for (i = 0; i < argc; i++) {
		wlists[i].fd = open(argv[i], O_RDONLY, 0);
		if (wlists[i].fd == -1 || fstat(wlists[i].fd, &sb) != 0)
			err(1, "%s", argv[i]);
		if (sb.st_size > SIZE_T_MAX)
			errx(1, "%s: %s", argv[i], strerror(EFBIG));
		wlists[i].front = mmap(NULL, (size_t)sb.st_size, PROT_READ,
		    MAP_PRIVATE, wlists[i].fd, (off_t)0);
		if (wlists[i].front == MAP_FAILED)
			err(1, "%s", argv[i]);
		wlists[i].back = wlists[i].front + (size_t)sb.st_size;
	}
	wlists[i].fd = -1;

	/* Open file where found words are to be saved. */
	if (outfile == NULL)
		found = NULL;
	else if ((found = fopen(outfile, "w")) == NULL)
		err(1, "cannot open %s", outfile);

	/*
	 * One iteration per input line.  `file' is where print_word()
	 * reports the word: the "found" file when it was recognised,
	 * stdout when it was not.
	 */
	for (;; print_word(file)) {
		affix[0] = '\0';
		file = found;

		/*
		 * Read one word; on exit *ep holds the terminating newline.
		 * A partial last line without a newline is dropped.
		 */
		for (ep = word;;) {
			if ((ch = getchar()) == EOF)
				goto done;
			*ep = (char)ch;
			if (ch == '\n')
				break;
			if ((size_t)(ep - word) == sizeof(word) - 1) {
				/*
				 * The buffer is full and the line goes on:
				 * complain, discard the rest of the line and
				 * start over with the next one instead of
				 * writing past the end of word[].
				 */
				*ep = '\0';
				warnx("word too long (%s)", word);
				while ((ch = getchar()) != '\n' && ch != EOF)
					continue;	/* slurp until EOL */
				if (ch == EOF)
					goto done;
				ep = word;
				continue;
			}
			ep++;
		}

		/* Keep an untouched, NUL-terminated copy for output. */
		len = (size_t)(ep - word);
		memcpy(original, word, len);
		original[len] = '\0';

		fold = 0;
		for (cp = word; cp < ep; cp++) {
			if (islower((unsigned char)*cp))
				goto lcase;
		}
		/* No lower-case letters at all: try the word verbatim first. */
		if (trypref(ep, ".", 0))
			continue;
		/* Then retry with everything but the first letter folded. */
		++fold;
		for (cp = original + 1, dp = word + 1; dp < ep; dp++, cp++)
			*dp = tolower((unsigned char)*cp);
lcase:
		if (trypref(ep, ".", 0) || suffix(ep, 0))
			continue;
		if (isupper((unsigned char)word[0])) {
			/*
			 * Restore the word (the affix routines may have
			 * altered it), lower-case the first letter as well
			 * and try once more.
			 */
			for (cp = original, dp = word; (*dp = *cp++); dp++) {
				if (fold)
					*dp = tolower((unsigned char)*dp);
			}
			word[0] = tolower((unsigned char)word[0]);
			goto lcase;
		}
		/* Not recognised in any form: report it on stdout. */
		file = stdout;
	}

done:
	if (found != NULL)
		fclose(found);
	exit(0);
}
/*
 * Write the original spelling of the current word to `f'; a NULL
 * stream means there is nowhere to report to.  With -v, a collected
 * derivation that does not start with '.' is printed in front of the
 * word, separated by a tab.
 */
static void
print_word(FILE *f)
{
	int with_derivation;

	if (f == NULL)
		return;

	with_derivation = vflag && affix[0] != '\0' && affix[0] != '.';
	if (with_derivation)
		fprintf(f, "%s\t%s\n", affix, original);
	else
		fprintf(f, "%s\n", original);
}
/*
 * Find the first entry in suftab whose suffix ends the word that stops
 * at `ep', and apply the rule(s) attached to it (p1, then p2).
 *
 * Only that first matching entry is consulted.  The stem left in front
 * of the suffix must contain at least one vowel, otherwise the word is
 * rejected.  `lev' is the caller's derivation level.
 *
 * Returns non-zero when one of the rules accepted the word.
 */
static int
suffix(char *ep, size_t lev)
{
	const struct suftab *t;
	const char *sp;
	const char *stem;
	char *cp;

	lev += DLEV;
	/* Slots lev - 1 through lev + 1 are written below. */
	getderiv(lev + 2);
	deriv.buf[lev] = deriv.buf[lev - 1] = NULL;

	for (t = suftab; t->suf != NULL; t++) {
		/*
		 * Table suffixes are stored reversed, so walk the word
		 * backwards from ep while walking the suffix forwards.
		 * Never step in front of word[0].
		 */
		cp = ep;
		for (sp = t->suf; *sp != '\0'; sp++) {
			if (cp == word || *--cp != *sp)
				break;
		}
		if (*sp != '\0')
			continue;	/* suffix does not match */

		/* Require a vowel somewhere in the stem word..cp. */
		for (stem = cp; stem > word; stem--) {
			if (vowel((unsigned char)stem[-1]))
				break;
		}
		if (stem == word)
			return 0;

		if ((*t->p1)(ep - t->n1, t->d1, t->a1, lev + 1))
			return 1;
		if (t->p2 != NULL) {
			/* Forget whatever the first rule recorded. */
			deriv.buf[lev] = deriv.buf[lev + 1] = NULL;
			return (*t->p2)(ep - t->n2, t->d2, t->a2, lev);
		}
		return 0;
	}
	return 0;
}
/*
 * Suffix rule that accepts nothing.  Entries in suftab that use it make
 * suffix() reject the word without trying any later table entry.
 */
static int
/*ARGSUSED*/
nop(char *ep, const char *d, const char *a, size_t lev)
{
	(void)ep;
	(void)d;
	(void)a;
	(void)lev;

	return 0;
}
/*
 * Plain suffix removal: look the shortened word up (with `a' as its
 * derivation note) and, failing that, try to peel off another suffix.
 * Returns 1 on success, 0 otherwise.
 */
static int
/*ARGSUSED*/
strip(char *ep, const char *d, const char *a, size_t lev)
{
	if (trypref(ep, a, lev))
		return 1;
	return suffix(ep, lev) != 0;
}
/*
 * Rule for a trailing "s".  It is refused above derivation level
 * DLEV + 1 and when the stem itself already ends in 's' (a double
 * "ss"); otherwise the suffix is simply stripped.
 */
static int
s(char *ep, const char *d, const char *a, const size_t lev)
{
	if (lev > DLEV + 1 || (*ep == 's' && ep[-1] == 's'))
		return 0;

	return strip(ep, d, a, lev);
}
/*
 * Rule for "-an"/"-ian": only applies when the word starts with an
 * upper-case letter (must be proper name); the stem is then looked up
 * directly, without further suffix stripping.
 */
static int
/*ARGSUSED*/
an(char *ep, const char *d, const char *a, size_t lev)
{
	return isupper((unsigned char)*word) ? trypref(ep, a, lev) : 0;
}
/*
 * Rule for "-ization": write an 'e' at ep and strip from just behind
 * it, using `d' as the derivation note.  The overwritten character is
 * not put back.
 */
static int
/*ARGSUSED*/
ize(char *ep, const char *d, const char *a, size_t lev)
{
	ep[0] = 'e';
	return strip(ep + 1, "", d, lev);
}
/*
 * Try the stem with an 'e' appended: the character at ep is
 * temporarily replaced by 'e' and the word ending just after it is
 * checked with `d' as the derivation note.  The replaced character is
 * restored only when the attempt fails.
 */
static int
/*ARGSUSED*/
y_to_e(char *ep, const char *d, const char *a, size_t lev)
{
	const char saved = ep[0];

	ep[0] = 'e';
	if (strip(ep + 1, "", d, lev))
		return 1;

	ep[0] = saved;
	return 0;
}
static int
ily(char *ep, const char *d, const char *a, size_t lev)
{
if (ep[-1] == 'i')
return i_to_y(ep, d, a, lev);
else
return strip(ep, d, a, lev);
}
/*
 * Rule for "-nce"/"-ncy": provided skipping back over two vowel groups
 * from the last stem letter stays inside the word, the last stem
 * letter is replaced by 't' and the result is stripped.
 */
static int
ncy(char *ep, const char *d, const char *a, size_t lev)
{
	char *before;

	before = skipv(skipv(ep - 1));
	if (before < word)
		return 0;

	ep[-1] = 't';
	return strip(ep, d, a, lev);
}
/*
 * Rule for "-bility": put an 'l' at ep and let y_to_e() try an 'e'
 * right after it.
 */
static int
bility(char *ep, const char *d, const char *a, size_t lev)
{
	ep[0] = 'l';
	return y_to_e(ep + 1, d, a, lev);
}
/*
 * Undo a y -> i change: when the stem ends in 'i' that letter becomes
 * 'y' and `d' is used as the derivation note; otherwise the stem is
 * left alone and `a' is used.  The stem is then stripped.
 */
static int
i_to_y(char *ep, const char *d, const char *a, size_t lev)
{
	const char *note = a;

	if (ep[-1] == 'i') {
		ep[-1] = 'y';
		note = d;
	}
	return strip(ep, "", note, lev);
}
/*
 * Rule for "-es", applied only up to derivation level DLEV.  A stem
 * ending in 'i' goes through i_to_y(); one ending in s, h, z or x is
 * stripped; every other stem is rejected.
 */
static int
es(char *ep, const char *d, const char *a, size_t lev)
{
	char last;

	if (lev > DLEV)
		return 0;

	last = ep[-1];
	if (last == 'i')
		return i_to_y(ep, d, a, lev);
	if (last == 's' || last == 'h' || last == 'z' || last == 'x')
		return strip(ep, d, a, lev);
	return 0;
}
/*
 * Rule for "-metry": overwrite the two letters in front of ep with
 * "er" and strip the result.
 */
static int
metry(char *ep, const char *d, const char *a, size_t lev)
{
	memcpy(ep - 2, "er", 2);
	return strip(ep, d, a, lev);
}
/*
 * Rule for "-tion", "-tive" and "-ator", selected by the letter two
 * positions in front of ep: 'c' or 'r' looks the stem up directly,
 * 'a' tries the stem with an 'e' put back, anything else is rejected.
 */
static int
tion(char *ep, const char *d, const char *a, size_t lev)
{
	const char key = ep[-2];

	if (key == 'c' || key == 'r')
		return trypref(ep, a, lev);
	if (key == 'a')
		return y_to_e(ep, d, a, lev);
	return 0;
}
/*
* Possible consonant-consonant-e ending.
*
* Decides, from the last two letters of the stem, whether to try the
* stem with an 'e' restored (y_to_e), to reject the word, or to hand
* it on to VCe(). Inside the switch a `break' always means "fall back
* to VCe()".
*/
static int
CCe(char *ep, const char *d, const char *a, size_t lev)
{
switch (ep[-1]) {
case 'l':
/* Vowel before the 'l': leave it to VCe(). */
if (vowel(ep[-2]))
break;
switch (ep[-2]) {
/* "ll", "rl" and "wl" also go to VCe(). */
case 'l':
case 'r':
case 'w':
break;
default:
/* Any other letter + 'l': only the restored-'e' form is tried. */
return y_to_e(ep, d, a, lev);
}
break;
case 's':
/* "ss" goes to VCe(); a single 's' is treated like 'c'/'g'. */
if (ep[-2] == 's')
break;
/*FALLTHROUGH*/
case 'c':
case 'g':
/* Reject when the character at ep (start of the suffix) is 'a'. */
if (*ep == 'a')
return 0;
/*FALLTHROUGH*/
case 'v':
case 'z':
/* Vowel before the consonant: leave it to VCe(). */
if (vowel(ep[-2]))
break;
/*FALLTHROUGH*/
case 'u':
if (y_to_e(ep, d, a, lev))
return 1;
/*
* The restored-'e' form failed. Only a stem that now ends in
* "ng" is passed on to VCe(); the letters are read again here
* rather than reused from before the call.
*/
if (!(ep[-2] == 'n' && ep[-1] == 'g'))
return 0;
}
return VCe(ep, d, a, lev);
}
/*
 * Possible consonant-vowel-consonant-e ending.
 *
 * A stem that already ends in 'e' is rejected.  When the stem ends in
 * a vowel followed by a consonant, the form with an 'e' appended is
 * tried first (derivation note `d'); if that fails the overwritten
 * character is put back and the bare stem is stripped normally.
 */
static int
VCe(char *ep, const char *d, const char *a, size_t lev)
{
	char saved;

	if (ep[-1] == 'e')
		return 0;

	if (!vowel(ep[-1]) && vowel(ep[-2])) {
		saved = ep[0];
		ep[0] = 'e';
		if (trypref(ep + 1, d, lev) || suffix(ep + 1, lev))
			return 1;
		ep[0] = saved;
	}
	return strip(ep, d, a, lev);
}
static const char *
lookuppref(char **wp, char *ep)
{
const char **sp, *cp;
char *bp;
for (sp = preftab; *sp; sp++) {
bp = *wp;
for (cp = *sp; *cp; cp++, bp++) {
if (tolower((unsigned char)*bp) != *cp)
goto next;
}
for (cp = bp; cp < ep; cp++) {
if (vowel(*cp)) {
*wp = bp;
return *sp;
}
}
next: ;
}
return 0;
}
/*
 * If the word is not in the dictionary, try stripping off prefixes
 * until the word is found or we run out of prefixes to check.
 *
 * `ep' marks the end of the candidate word, `a' is the derivation note
 * recorded for level `lev'.  The prefixes stripped so far are
 * accumulated as "+pref+pref..." in a small local buffer; when the
 * next prefix would no longer fit, the search simply stops.
 *
 * Returns 1 when the word (with or without prefixes) was found,
 * 0 otherwise.
 */
static int
trypref(char *ep, const char *a, size_t lev)
{
	const char *cp;
	char *bp;
	char *pp;
	size_t plen, room;
	int val = 0;
	char space[20];

	/* Slots lev through lev + 2 are written below. */
	getderiv(lev + 3);
	deriv.buf[lev] = a;
	if (tryword(word, ep, lev))
		return 1;

	bp = word;
	pp = space;
	*pp = '\0';
	deriv.buf[lev + 1] = pp;
	while ((cp = lookuppref(&bp, ep)) != NULL) {
		/* Need room for '+', the prefix and the terminating NUL. */
		plen = strlen(cp);
		room = sizeof(space) - (size_t)(pp - space);
		if (plen + 2 > room)
			break;
		*pp++ = '+';
		memcpy(pp, cp, plen + 1);
		pp += plen;
		if (tryword(bp, ep, lev + 1)) {
			val = 1;
			break;
		}
	}
	/* `space' is about to go out of scope: drop every reference to it. */
	deriv.buf[lev + 1] = deriv.buf[lev + 2] = NULL;
	return val;
}
/*
 * Look up the candidate word bp..ep in the word lists.
 *
 * Candidates of fewer than two letters are rejected, as are
 * monosyllabic stems when the character at ep is a vowel.  If the
 * plain lookup fails, ep is a vowel and the stem ends in a doubled
 * letter, the stem without the second letter is tried as well.
 *
 * With -v, a successful lookup appends the derivation notes recorded
 * for levels lev..1 to affix[].
 *
 * Returns the result of dict(): non-zero when the word was found.
 */
static int
tryword(char *bp, char *ep, size_t lev)
{
	size_t j;
	int found;
	int doubled = 0;
	char duple[3];

	if (ep - bp <= 1)
		return 0;
	if (vowel(*ep) && monosyl(bp, ep))
		return 0;

	found = dict(bp, ep);
	if (found == 0 && vowel(*ep) && ep[-1] == ep[-2] &&
	    monosyl(bp, ep - 1)) {
		/* Drop the doubled letter and note it as "+<letter>". */
		ep--;
		lev++;
		getderiv(lev + 1);	/* slot lev is written below */
		duple[0] = '+';
		duple[1] = *ep;
		duple[2] = '\0';
		deriv.buf[lev] = duple;
		doubled = 1;
		found = dict(bp, ep);
	}

	if (vflag != 0 && found != 0) {
		/* Also tack on possible derivations. (XXX - warn on truncation?) */
		for (j = lev; j > 0; j--) {
			if (deriv.buf[j] != NULL)
				(void)strlcat(affix, deriv.buf[j], sizeof(affix));
		}
	}

	/* Do not leave a pointer to the local `duple' in the global table. */
	if (doubled)
		deriv.buf[lev] = NULL;
	return found;
}
/*
 * Report whether bp..ep looks like a single closed syllable: at least
 * two letters, ending in a vowel followed by one consonant other than
 * 'x' or 'w', with no vowel anywhere in front of that pair.
 * Returns 1 if so, 0 otherwise.
 */
static int
monosyl(char *bp, char *ep)
{
	const char *p;
	char last, penult;

	if (ep - bp < 2)
		return 0;

	last = ep[-1];
	penult = ep[-2];
	if (vowel(last) || !vowel(penult) || last == 'x' || last == 'w')
		return 0;

	for (p = bp; p < ep - 2; p++) {
		if (vowel(*p))
			return 0;
	}
	return 1;
}
/*
 * Starting at `st' and moving towards the start of word[], step over
 * one vowel (if st is on one) and then over the consonants in front of
 * it.  Returns the position reached, which is in front of word[0] when
 * the beginning of the word was passed.
 */
static char *
skipv(char *st)
{
	char *p = st;

	if (p >= word && vowel(*p))
		p--;
	for (; p >= word && !vowel(*p); p--)
		continue;
	return p;
}
/*
 * Return 1 if `c' is one of the letters a, e, i, o, u or y in either
 * case, 0 otherwise.
 *
 * Callers pass plain `char' values, which may be negative.  The value
 * is converted through unsigned char before it reaches tolower(),
 * whose argument must be representable as an unsigned char (or be
 * EOF).
 */
static int
vowel(int c)
{
	switch (tolower((unsigned char)c)) {
	case 'a':
	case 'e':
	case 'i':
	case 'o':
	case 'u':
	case 'y':
		return 1;
	default:
		return 0;
	}
}
/*
 * Crummy way to Britishise: for every suftab entry whose suffix
 * contains a 'z', replace the suffix -- and its first pair of
 * derivation notes where they contain a 'z' too -- by a private copy
 * with each 'z' turned into 's'.
 */
static void
ise(void)
{
	struct suftab *tab;
	char *copy;

	for (tab = suftab; tab->suf != NULL; tab++) {
		/* Assume that suffix will contain 'z' if a1 or d1 do */
		if (strchr(tab->suf, 'z') == NULL)
			continue;

		copy = estrdup(tab->suf);
		ztos(copy);
		tab->suf = copy;

		if (strchr(tab->d1, 'z') != NULL) {
			copy = estrdup(tab->d1);
			ztos(copy);
			tab->d1 = copy;
		}
		if (strchr(tab->a1, 'z') != NULL) {
			copy = estrdup(tab->a1);
			ztos(copy);
			tab->a1 = copy;
		}
	}
}
/*
 * Replace every 'z' in the NUL-terminated string `st' by 's', in place.
 */
static void
ztos(char *st)
{
	char *z;

	for (z = strchr(st, 'z'); z != NULL; z = strchr(z + 1, 'z'))
		*z = 's';
}
/*
* Look up a word in the dictionary.
* Returns 1 if found, 0 if not.
*/
static int
dict(char *bp, char *ep)
{
char c;
int i, rval;
c = *ep;
*ep = '\0';
if (xflag)
printf("=%s\n", bp);
for (i = rval = 0; wlists[i].fd != -1; i++) {
if ((rval = look((unsigned char *)bp, wlists[i].front,
wlists[i].back)) == 1)
break;
}
*ep = c;
return rval;
}
/*
 * Make sure deriv.buf can be indexed up to and including `lev'.
 *
 * Callers store into slot `lev' itself, so lev + 1 elements are
 * needed.  Newly added slots are set to NULL so that a level nobody
 * has written yet reads as "no derivation".  Exits via err()/errx()
 * if the table cannot be grown.
 */
static void
getderiv(size_t lev)
{
	const char **p;
	size_t have, i;

	if (deriv.buf != NULL && lev <= deriv.maxlev)
		return;

	/* Guard the size computation below against overflow. */
	if (lev >= (size_t)-1 / sizeof(*deriv.buf))
		errx(1, "Cannot grow array");

	have = (deriv.buf == NULL) ? 0 : deriv.maxlev + 1;
	p = realloc(deriv.buf, sizeof(*deriv.buf) * (lev + 1));
	if (p == NULL)
		err(1, "Cannot grow array");
	for (i = have; i <= lev; i++)
		p[i] = NULL;

	deriv.buf = p;
	deriv.maxlev = lev;	/* highest valid index */
}
/*
 * Print the command synopsis to stderr and exit with status 1.
 */
static void
usage(void)
{
	const char *progname = getprogname();

	(void)fprintf(stderr,
	    "Usage: %s [-bvx] [-o found-words] word-list ...\n",
	    progname);
	exit(1);
}