583 lines
13 KiB
C
583 lines
13 KiB
C
|
/*
|
||
|
* Copyright (c) 1980 The Regents of the University of California.
|
||
|
* All rights reserved.
|
||
|
*
|
||
|
* Redistribution and use in source and binary forms, with or without
|
||
|
* modification, are permitted provided that the following conditions
|
||
|
* are met:
|
||
|
* 1. Redistributions of source code must retain the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer.
|
||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer in the
|
||
|
* documentation and/or other materials provided with the distribution.
|
||
|
* 3. All advertising materials mentioning features or use of this software
|
||
|
* must display the following acknowledgement:
|
||
|
* This product includes software developed by the University of
|
||
|
* California, Berkeley and its contributors.
|
||
|
* 4. Neither the name of the University nor the names of its contributors
|
||
|
* may be used to endorse or promote products derived from this software
|
||
|
* without specific prior written permission.
|
||
|
*
|
||
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||
|
* SUCH DAMAGE.
|
||
|
*/
|
||
|
|
||
|
#ifndef lint
|
||
|
static char sccsid[] = "@(#)regexp.c 5.3 (Berkeley) 6/1/90";
|
||
|
#endif /* not lint */
|
||
|
|
||
|
#include <ctype.h>
|
||
|
|
||
|
typedef int boolean;
|
||
|
#define TRUE 1
|
||
|
#define FALSE 0
|
||
|
#define NIL 0
|
||
|
|
||
|
boolean l_onecase; /* true if upper and lower equivalent */
|
||
|
|
||
|
#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
|
||
|
|
||
|
/* STRNCMP - like strncmp except that we convert the
|
||
|
* first string to lower case before comparing
|
||
|
* if l_onecase is set.
|
||
|
*/
|
||
|
|
||
|
STRNCMP(s1, s2, len)
|
||
|
register char *s1,*s2;
|
||
|
register int len;
|
||
|
{
|
||
|
if (l_onecase) {
|
||
|
do
|
||
|
if (*s2 - makelower(*s1))
|
||
|
return (*s2 - makelower(*s1));
|
||
|
else {
|
||
|
s2++;
|
||
|
s1++;
|
||
|
}
|
||
|
while (--len);
|
||
|
} else {
|
||
|
do
|
||
|
if (*s2 - *s1)
|
||
|
return (*s2 - *s1);
|
||
|
else {
|
||
|
s2++;
|
||
|
s1++;
|
||
|
}
|
||
|
while (--len);
|
||
|
}
|
||
|
return(0);
|
||
|
}
|
||
|
|
||
|
/* The following routine converts an irregular expression to
|
||
|
* internal format.
|
||
|
*
|
||
|
* Either meta symbols (\a \d or \p) or character strings or
|
||
|
* operations ( alternation or perenthesizing ) can be
|
||
|
* specified. Each starts with a descriptor byte. The descriptor
|
||
|
* byte has STR set for strings, META set for meta symbols
|
||
|
* and OPER set for operations.
|
||
|
* The descriptor byte can also have the OPT bit set if the object
|
||
|
* defined is optional. Also ALT can be set to indicate an alternation.
|
||
|
*
|
||
|
* For metasymbols the byte following the descriptor byte identities
|
||
|
* the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
|
||
|
* strings the byte after the descriptor is a character count for
|
||
|
* the string:
|
||
|
*
|
||
|
* meta symbols := descriptor
|
||
|
* symbol
|
||
|
*
|
||
|
* strings := descriptor
|
||
|
* character count
|
||
|
* the string
|
||
|
*
|
||
|
* operatins := descriptor
|
||
|
* symbol
|
||
|
* character count
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
* handy macros for accessing parts of match blocks
|
||
|
*/
|
||
|
#define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
|
||
|
#define MNEXT(A) (A+2) /* character following a metasymbol block */
|
||
|
|
||
|
#define OSYM(A) (*(A+1)) /* symbol in an operation block */
|
||
|
#define OCNT(A) (*(A+2)) /* character count */
|
||
|
#define ONEXT(A) (A+3) /* next character after the operation */
|
||
|
#define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
|
||
|
|
||
|
#define SCNT(A) (*(A+1)) /* byte count of a string */
|
||
|
#define SSTR(A) (A+2) /* address of the string */
|
||
|
#define SNEXT(A) (A+2+*(A+1)) /* character following the string */
|
||
|
|
||
|
/*
|
||
|
* bit flags in the descriptor
|
||
|
*/
|
||
|
#define OPT 1
|
||
|
#define STR 2
|
||
|
#define META 4
|
||
|
#define ALT 8
|
||
|
#define OPER 16
|
||
|
|
||
|
char *ure; /* pointer current position in unconverted exp */
|
||
|
char *ccre; /* pointer to current position in converted exp*/
|
||
|
char *malloc();
|
||
|
|
||
|
char *
|
||
|
convexp(re)
|
||
|
char *re; /* unconverted irregular expression */
|
||
|
{
|
||
|
register char *cre; /* pointer to converted regular expression */
|
||
|
|
||
|
/* allocate room for the converted expression */
|
||
|
if (re == NIL)
|
||
|
return (NIL);
|
||
|
if (*re == '\0')
|
||
|
return (NIL);
|
||
|
cre = malloc (4 * strlen(re) + 3);
|
||
|
ccre = cre;
|
||
|
ure = re;
|
||
|
|
||
|
/* start the conversion with a \a */
|
||
|
*cre = META | OPT;
|
||
|
MSYM(cre) = 'a';
|
||
|
ccre = MNEXT(cre);
|
||
|
|
||
|
/* start the conversion (its recursive) */
|
||
|
expconv ();
|
||
|
*ccre = 0;
|
||
|
return (cre);
|
||
|
}
|
||
|
|
||
|
expconv()
|
||
|
{
|
||
|
register char *cs; /* pointer to current symbol in converted exp */
|
||
|
register char c; /* character being processed */
|
||
|
register char *acs; /* pinter to last alternate */
|
||
|
register int temp;
|
||
|
|
||
|
/* let the conversion begin */
|
||
|
acs = NIL;
|
||
|
cs = NIL;
|
||
|
while (*ure != NIL) {
|
||
|
switch (c = *ure++) {
|
||
|
|
||
|
case '\\':
|
||
|
switch (c = *ure++) {
|
||
|
|
||
|
/* escaped characters are just characters */
|
||
|
default:
|
||
|
if (cs == NIL || (*cs & STR) == 0) {
|
||
|
cs = ccre;
|
||
|
*cs = STR;
|
||
|
SCNT(cs) = 1;
|
||
|
ccre += 2;
|
||
|
} else
|
||
|
SCNT(cs)++;
|
||
|
*ccre++ = c;
|
||
|
break;
|
||
|
|
||
|
/* normal(?) metacharacters */
|
||
|
case 'a':
|
||
|
case 'd':
|
||
|
case 'e':
|
||
|
case 'p':
|
||
|
if (acs != NIL && acs != cs) {
|
||
|
do {
|
||
|
temp = OCNT(acs);
|
||
|
OCNT(acs) = ccre - acs;
|
||
|
acs -= temp;
|
||
|
} while (temp != 0);
|
||
|
acs = NIL;
|
||
|
}
|
||
|
cs = ccre;
|
||
|
*cs = META;
|
||
|
MSYM(cs) = c;
|
||
|
ccre = MNEXT(cs);
|
||
|
break;
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
/* just put the symbol in */
|
||
|
case '^':
|
||
|
case '$':
|
||
|
if (acs != NIL && acs != cs) {
|
||
|
do {
|
||
|
temp = OCNT(acs);
|
||
|
OCNT(acs) = ccre - acs;
|
||
|
acs -= temp;
|
||
|
} while (temp != 0);
|
||
|
acs = NIL;
|
||
|
}
|
||
|
cs = ccre;
|
||
|
*cs = META;
|
||
|
MSYM(cs) = c;
|
||
|
ccre = MNEXT(cs);
|
||
|
break;
|
||
|
|
||
|
/* mark the last match sequence as optional */
|
||
|
case '?':
|
||
|
if (cs)
|
||
|
*cs = *cs | OPT;
|
||
|
break;
|
||
|
|
||
|
/* recurse and define a subexpression */
|
||
|
case '(':
|
||
|
if (acs != NIL && acs != cs) {
|
||
|
do {
|
||
|
temp = OCNT(acs);
|
||
|
OCNT(acs) = ccre - acs;
|
||
|
acs -= temp;
|
||
|
} while (temp != 0);
|
||
|
acs = NIL;
|
||
|
}
|
||
|
cs = ccre;
|
||
|
*cs = OPER;
|
||
|
OSYM(cs) = '(';
|
||
|
ccre = ONEXT(cs);
|
||
|
expconv ();
|
||
|
OCNT(cs) = ccre - cs; /* offset to next symbol */
|
||
|
break;
|
||
|
|
||
|
/* return from a recursion */
|
||
|
case ')':
|
||
|
if (acs != NIL) {
|
||
|
do {
|
||
|
temp = OCNT(acs);
|
||
|
OCNT(acs) = ccre - acs;
|
||
|
acs -= temp;
|
||
|
} while (temp != 0);
|
||
|
acs = NIL;
|
||
|
}
|
||
|
cs = ccre;
|
||
|
*cs = META;
|
||
|
MSYM(cs) = c;
|
||
|
ccre = MNEXT(cs);
|
||
|
return;
|
||
|
|
||
|
/* mark the last match sequence as having an alternate */
|
||
|
/* the third byte will contain an offset to jump over the */
|
||
|
/* alternate match in case the first did not fail */
|
||
|
case '|':
|
||
|
if (acs != NIL && acs != cs)
|
||
|
OCNT(ccre) = ccre - acs; /* make a back pointer */
|
||
|
else
|
||
|
OCNT(ccre) = 0;
|
||
|
*cs |= ALT;
|
||
|
cs = ccre;
|
||
|
*cs = OPER;
|
||
|
OSYM(cs) = '|';
|
||
|
ccre = ONEXT(cs);
|
||
|
acs = cs; /* remember that the pointer is to be filles */
|
||
|
break;
|
||
|
|
||
|
/* if its not a metasymbol just build a scharacter string */
|
||
|
default:
|
||
|
if (cs == NIL || (*cs & STR) == 0) {
|
||
|
cs = ccre;
|
||
|
*cs = STR;
|
||
|
SCNT(cs) = 1;
|
||
|
ccre = SSTR(cs);
|
||
|
} else
|
||
|
SCNT(cs)++;
|
||
|
*ccre++ = c;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
if (acs != NIL) {
|
||
|
do {
|
||
|
temp = OCNT(acs);
|
||
|
OCNT(acs) = ccre - acs;
|
||
|
acs -= temp;
|
||
|
} while (temp != 0);
|
||
|
acs = NIL;
|
||
|
}
|
||
|
return;
|
||
|
}
|
||
|
/* end of convertre */
|
||
|
|
||
|
|
||
|
/*
|
||
|
* The following routine recognises an irregular expresion
|
||
|
* with the following special characters:
|
||
|
*
|
||
|
* \? - means last match was optional
|
||
|
* \a - matches any number of characters
|
||
|
* \d - matches any number of spaces and tabs
|
||
|
* \p - matches any number of alphanumeric
|
||
|
* characters. The
|
||
|
* characters matched will be copied into
|
||
|
* the area pointed to by 'name'.
|
||
|
* \| - alternation
|
||
|
* \( \) - grouping used mostly for alternation and
|
||
|
* optionality
|
||
|
*
|
||
|
* The irregular expression must be translated to internal form
|
||
|
* prior to calling this routine
|
||
|
*
|
||
|
* The value returned is the pointer to the first non \a
|
||
|
* character matched.
|
||
|
*/
|
||
|
|
||
|
boolean _escaped; /* true if we are currently _escaped */
|
||
|
char *_start; /* start of string */
|
||
|
|
||
|
char *
|
||
|
expmatch (s, re, mstring)
|
||
|
register char *s; /* string to check for a match in */
|
||
|
register char *re; /* a converted irregular expression */
|
||
|
register char *mstring; /* where to put whatever matches a \p */
|
||
|
{
|
||
|
register char *cs; /* the current symbol */
|
||
|
register char *ptr,*s1; /* temporary pointer */
|
||
|
boolean matched; /* a temporary boolean */
|
||
|
|
||
|
/* initial conditions */
|
||
|
if (re == NIL)
|
||
|
return (NIL);
|
||
|
cs = re;
|
||
|
matched = FALSE;
|
||
|
|
||
|
/* loop till expression string is exhausted (or at least pretty tired) */
|
||
|
while (*cs) {
|
||
|
switch (*cs & (OPER | STR | META)) {
|
||
|
|
||
|
/* try to match a string */
|
||
|
case STR:
|
||
|
matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
|
||
|
if (matched) {
|
||
|
|
||
|
/* hoorah it matches */
|
||
|
s += SCNT(cs);
|
||
|
cs = SNEXT(cs);
|
||
|
} else if (*cs & ALT) {
|
||
|
|
||
|
/* alternation, skip to next expression */
|
||
|
cs = SNEXT(cs);
|
||
|
} else if (*cs & OPT) {
|
||
|
|
||
|
/* the match is optional */
|
||
|
cs = SNEXT(cs);
|
||
|
matched = 1; /* indicate a successful match */
|
||
|
} else {
|
||
|
|
||
|
/* no match, error return */
|
||
|
return (NIL);
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
/* an operator, do something fancy */
|
||
|
case OPER:
|
||
|
switch (OSYM(cs)) {
|
||
|
|
||
|
/* this is an alternation */
|
||
|
case '|':
|
||
|
if (matched)
|
||
|
|
||
|
/* last thing in the alternation was a match, skip ahead */
|
||
|
cs = OPTR(cs);
|
||
|
else
|
||
|
|
||
|
/* no match, keep trying */
|
||
|
cs = ONEXT(cs);
|
||
|
break;
|
||
|
|
||
|
/* this is a grouping, recurse */
|
||
|
case '(':
|
||
|
ptr = expmatch (s, ONEXT(cs), mstring);
|
||
|
if (ptr != NIL) {
|
||
|
|
||
|
/* the subexpression matched */
|
||
|
matched = 1;
|
||
|
s = ptr;
|
||
|
} else if (*cs & ALT) {
|
||
|
|
||
|
/* alternation, skip to next expression */
|
||
|
matched = 0;
|
||
|
} else if (*cs & OPT) {
|
||
|
|
||
|
/* the match is optional */
|
||
|
matched = 1; /* indicate a successful match */
|
||
|
} else {
|
||
|
|
||
|
/* no match, error return */
|
||
|
return (NIL);
|
||
|
}
|
||
|
cs = OPTR(cs);
|
||
|
break;
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
/* try to match a metasymbol */
|
||
|
case META:
|
||
|
switch (MSYM(cs)) {
|
||
|
|
||
|
/* try to match anything and remember what was matched */
|
||
|
case 'p':
|
||
|
/*
|
||
|
* This is really the same as trying the match the
|
||
|
* remaining parts of the expression to any subset
|
||
|
* of the string.
|
||
|
*/
|
||
|
s1 = s;
|
||
|
do {
|
||
|
ptr = expmatch (s1, MNEXT(cs), mstring);
|
||
|
if (ptr != NIL && s1 != s) {
|
||
|
|
||
|
/* we have a match, remember the match */
|
||
|
strncpy (mstring, s, s1 - s);
|
||
|
mstring[s1 - s] = '\0';
|
||
|
return (ptr);
|
||
|
} else if (ptr != NIL && (*cs & OPT)) {
|
||
|
|
||
|
/* it was aoptional so no match is ok */
|
||
|
return (ptr);
|
||
|
} else if (ptr != NIL) {
|
||
|
|
||
|
/* not optional and we still matched */
|
||
|
return (NIL);
|
||
|
}
|
||
|
if (!isalnum(*s1) && *s1 != '_')
|
||
|
return (NIL);
|
||
|
if (*s1 == '\\')
|
||
|
_escaped = _escaped ? FALSE : TRUE;
|
||
|
else
|
||
|
_escaped = FALSE;
|
||
|
} while (*s1++);
|
||
|
return (NIL);
|
||
|
|
||
|
/* try to match anything */
|
||
|
case 'a':
|
||
|
/*
|
||
|
* This is really the same as trying the match the
|
||
|
* remaining parts of the expression to any subset
|
||
|
* of the string.
|
||
|
*/
|
||
|
s1 = s;
|
||
|
do {
|
||
|
ptr = expmatch (s1, MNEXT(cs), mstring);
|
||
|
if (ptr != NIL && s1 != s) {
|
||
|
|
||
|
/* we have a match */
|
||
|
return (ptr);
|
||
|
} else if (ptr != NIL && (*cs & OPT)) {
|
||
|
|
||
|
/* it was aoptional so no match is ok */
|
||
|
return (ptr);
|
||
|
} else if (ptr != NIL) {
|
||
|
|
||
|
/* not optional and we still matched */
|
||
|
return (NIL);
|
||
|
}
|
||
|
if (*s1 == '\\')
|
||
|
_escaped = _escaped ? FALSE : TRUE;
|
||
|
else
|
||
|
_escaped = FALSE;
|
||
|
} while (*s1++);
|
||
|
return (NIL);
|
||
|
|
||
|
/* fail if we are currently _escaped */
|
||
|
case 'e':
|
||
|
if (_escaped)
|
||
|
return(NIL);
|
||
|
cs = MNEXT(cs);
|
||
|
break;
|
||
|
|
||
|
/* match any number of tabs and spaces */
|
||
|
case 'd':
|
||
|
ptr = s;
|
||
|
while (*s == ' ' || *s == '\t')
|
||
|
s++;
|
||
|
if (s != ptr || s == _start) {
|
||
|
|
||
|
/* match, be happy */
|
||
|
matched = 1;
|
||
|
cs = MNEXT(cs);
|
||
|
} else if (*s == '\n' || *s == '\0') {
|
||
|
|
||
|
/* match, be happy */
|
||
|
matched = 1;
|
||
|
cs = MNEXT(cs);
|
||
|
} else if (*cs & ALT) {
|
||
|
|
||
|
/* try the next part */
|
||
|
matched = 0;
|
||
|
cs = MNEXT(cs);
|
||
|
} else if (*cs & OPT) {
|
||
|
|
||
|
/* doesn't matter */
|
||
|
matched = 1;
|
||
|
cs = MNEXT(cs);
|
||
|
} else
|
||
|
|
||
|
/* no match, error return */
|
||
|
return (NIL);
|
||
|
break;
|
||
|
|
||
|
/* check for end of line */
|
||
|
case '$':
|
||
|
if (*s == '\0' || *s == '\n') {
|
||
|
|
||
|
/* match, be happy */
|
||
|
s++;
|
||
|
matched = 1;
|
||
|
cs = MNEXT(cs);
|
||
|
} else if (*cs & ALT) {
|
||
|
|
||
|
/* try the next part */
|
||
|
matched = 0;
|
||
|
cs = MNEXT(cs);
|
||
|
} else if (*cs & OPT) {
|
||
|
|
||
|
/* doesn't matter */
|
||
|
matched = 1;
|
||
|
cs = MNEXT(cs);
|
||
|
} else
|
||
|
|
||
|
/* no match, error return */
|
||
|
return (NIL);
|
||
|
break;
|
||
|
|
||
|
/* check for start of line */
|
||
|
case '^':
|
||
|
if (s == _start) {
|
||
|
|
||
|
/* match, be happy */
|
||
|
matched = 1;
|
||
|
cs = MNEXT(cs);
|
||
|
} else if (*cs & ALT) {
|
||
|
|
||
|
/* try the next part */
|
||
|
matched = 0;
|
||
|
cs = MNEXT(cs);
|
||
|
} else if (*cs & OPT) {
|
||
|
|
||
|
/* doesn't matter */
|
||
|
matched = 1;
|
||
|
cs = MNEXT(cs);
|
||
|
} else
|
||
|
|
||
|
/* no match, error return */
|
||
|
return (NIL);
|
||
|
break;
|
||
|
|
||
|
/* end of a subexpression, return success */
|
||
|
case ')':
|
||
|
return (s);
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
return (s);
|
||
|
}
|