NetBSD/usr.bin/vgrind/regexp.c

602 lines
14 KiB
C
Raw Normal View History

2014-07-15 17:17:15 +04:00
/* $NetBSD: regexp.c,v 1.13 2014/07/15 13:17:15 christos Exp $ */
1993-04-09 16:58:42 +04:00
/*
* Copyright (c) 1980, 1993
* The Regents of the University of California. All rights reserved.
*
1993-04-09 16:58:42 +04:00
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
1993-04-09 16:58:42 +04:00
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
2014-07-15 17:17:15 +04:00
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#include <sys/cdefs.h>
1993-04-09 16:58:42 +04:00
#ifndef lint
__COPYRIGHT("@(#) Copyright (c) 1980, 1993\
The Regents of the University of California. All rights reserved.");
#endif /* not lint */
#ifndef lint
#if 0
static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93";
#endif
2014-07-15 17:17:15 +04:00
__RCSID("$NetBSD: regexp.c,v 1.13 2014/07/15 13:17:15 christos Exp $");
1993-04-09 16:58:42 +04:00
#endif /* not lint */
2006-05-01 09:14:22 +04:00
#include <assert.h>
1993-04-09 16:58:42 +04:00
#include <ctype.h>
#include <stdlib.h>
2014-07-15 17:17:15 +04:00
#include <stdbool.h>
#include <string.h>
#include "extern.h"
1993-04-09 16:58:42 +04:00
2014-07-15 17:17:15 +04:00
static void expconv (void);
2014-07-15 17:17:15 +04:00
bool x_escaped; /* true if we are currently x_escaped */
char *x_start; /* start of string */
2014-07-15 17:17:15 +04:00
bool l_onecase; /* true if upper and lower equivalent */
1993-04-09 16:58:42 +04:00
#define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
1993-04-09 16:58:42 +04:00
2014-07-15 17:17:15 +04:00
static char
tocc(ptrdiff_t x) {
if (x & ~0xff)
abort();
return (char)x;
}
1993-04-09 16:58:42 +04:00
/* STRNCMP - like strncmp except that we convert the
* first string to lower case before comparing
* if l_onecase is set.
*/
int
2012-03-21 00:34:57 +04:00
STRNCMP(char *s1, char *s2, int len)
1993-04-09 16:58:42 +04:00
{
if (l_onecase) {
do
if (*s2 - makelower(*s1))
2014-07-15 17:17:15 +04:00
return *s2 - makelower(*s1);
1993-04-09 16:58:42 +04:00
else {
s2++;
s1++;
}
while (--len);
} else {
do
if (*s2 - *s1)
2014-07-15 17:17:15 +04:00
return *s2 - *s1;
1993-04-09 16:58:42 +04:00
else {
s2++;
s1++;
}
while (--len);
}
return(0);
}
/* The following routine converts an irregular expression to
* internal format.
*
* Either meta symbols (\a \d or \p) or character strings or
* operations ( alternation or perenthesizing ) can be
* specified. Each starts with a descriptor byte. The descriptor
* byte has STR set for strings, META set for meta symbols
* and OPER set for operations.
* The descriptor byte can also have the OPT bit set if the object
* defined is optional. Also ALT can be set to indicate an alternation.
*
* For metasymbols the byte following the descriptor byte identities
* the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
* strings the byte after the descriptor is a character count for
* the string:
*
* meta symbols := descriptor
* symbol
*
* strings := descriptor
* character count
* the string
*
* operatins := descriptor
* symbol
* character count
*/
/*
* handy macros for accessing parts of match blocks
*/
#define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
#define MNEXT(A) (A+2) /* character following a metasymbol block */
#define OSYM(A) (*(A+1)) /* symbol in an operation block */
#define OCNT(A) (*(A+2)) /* character count */
#define ONEXT(A) (A+3) /* next character after the operation */
#define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
#define SCNT(A) (*(A+1)) /* byte count of a string */
#define SSTR(A) (A+2) /* address of the string */
#define SNEXT(A) (A+2+*(A+1)) /* character following the string */
/*
* bit flags in the descriptor
*/
#define OPT 1
#define STR 2
#define META 4
#define ALT 8
#define OPER 16
static char *ccre; /* pointer to current position in converted exp*/
static char *ure; /* pointer current position in unconverted exp */
1993-04-09 16:58:42 +04:00
char *
2012-03-21 00:34:57 +04:00
convexp(char *re) /* unconverted irregular expression */
1993-04-09 16:58:42 +04:00
{
char *cre; /* pointer to converted regular expression */
1993-04-09 16:58:42 +04:00
/* allocate room for the converted expression */
2014-07-15 17:17:15 +04:00
if (re == NULL)
return NULL;
1993-04-09 16:58:42 +04:00
if (*re == '\0')
2014-07-15 17:17:15 +04:00
return NULL;
2003-07-12 17:37:15 +04:00
cre = malloc(4 * strlen(re) + 3);
1993-04-09 16:58:42 +04:00
ccre = cre;
ure = re;
/* start the conversion with a \a */
*cre = META | OPT;
MSYM(cre) = 'a';
ccre = MNEXT(cre);
/* start the conversion (its recursive) */
2003-07-12 17:37:15 +04:00
expconv();
1993-04-09 16:58:42 +04:00
*ccre = 0;
2014-07-15 17:17:15 +04:00
return cre;
1993-04-09 16:58:42 +04:00
}
static void
2012-03-21 00:34:57 +04:00
expconv(void)
1993-04-09 16:58:42 +04:00
{
char *cs; /* pointer to current symbol in converted exp */
char c; /* character being processed */
char *acs; /* pinter to last alternate */
int temp;
1993-04-09 16:58:42 +04:00
/* let the conversion begin */
2014-07-15 17:17:15 +04:00
acs = NULL;
cs = NULL;
while (*ure) {
1993-04-09 16:58:42 +04:00
switch (c = *ure++) {
case '\\':
switch (c = *ure++) {
/* escaped characters are just characters */
default:
2014-07-15 17:17:15 +04:00
if (cs == NULL || (*cs & STR) == 0) {
1993-04-09 16:58:42 +04:00
cs = ccre;
*cs = STR;
SCNT(cs) = 1;
ccre += 2;
} else
SCNT(cs)++;
*ccre++ = c;
break;
/* normal(?) metacharacters */
case 'a':
case 'd':
case 'e':
case 'p':
2014-07-15 17:17:15 +04:00
if (acs != NULL && acs != cs) {
1993-04-09 16:58:42 +04:00
do {
temp = OCNT(acs);
2014-07-15 17:17:15 +04:00
OCNT(acs) = tocc(ccre - acs);
1993-04-09 16:58:42 +04:00
acs -= temp;
} while (temp != 0);
2014-07-15 17:17:15 +04:00
acs = NULL;
1993-04-09 16:58:42 +04:00
}
cs = ccre;
*cs = META;
MSYM(cs) = c;
ccre = MNEXT(cs);
break;
}
break;
/* just put the symbol in */
case '^':
case '$':
2014-07-15 17:17:15 +04:00
if (acs != NULL && acs != cs) {
1993-04-09 16:58:42 +04:00
do {
temp = OCNT(acs);
2014-07-15 17:17:15 +04:00
OCNT(acs) = tocc(ccre - acs);
1993-04-09 16:58:42 +04:00
acs -= temp;
} while (temp != 0);
2014-07-15 17:17:15 +04:00
acs = NULL;
1993-04-09 16:58:42 +04:00
}
cs = ccre;
*cs = META;
MSYM(cs) = c;
ccre = MNEXT(cs);
break;
/* mark the last match sequence as optional */
case '?':
if (cs)
*cs = *cs | OPT;
break;
/* recurse and define a subexpression */
case '(':
2014-07-15 17:17:15 +04:00
if (acs != NULL && acs != cs) {
1993-04-09 16:58:42 +04:00
do {
temp = OCNT(acs);
2014-07-15 17:17:15 +04:00
OCNT(acs) = tocc(ccre - acs);
1993-04-09 16:58:42 +04:00
acs -= temp;
} while (temp != 0);
2014-07-15 17:17:15 +04:00
acs = NULL;
1993-04-09 16:58:42 +04:00
}
cs = ccre;
*cs = OPER;
OSYM(cs) = '(';
ccre = ONEXT(cs);
2003-07-12 17:37:15 +04:00
expconv();
2014-07-15 17:17:15 +04:00
OCNT(cs) = tocc(ccre - cs); /* offset to next symbol */
1993-04-09 16:58:42 +04:00
break;
/* reurn from a recursion */
1993-04-09 16:58:42 +04:00
case ')':
2014-07-15 17:17:15 +04:00
if (acs != NULL) {
1993-04-09 16:58:42 +04:00
do {
temp = OCNT(acs);
2014-07-15 17:17:15 +04:00
OCNT(acs) = tocc(ccre - acs);
1993-04-09 16:58:42 +04:00
acs -= temp;
} while (temp != 0);
2014-07-15 17:17:15 +04:00
acs = NULL;
1993-04-09 16:58:42 +04:00
}
cs = ccre;
*cs = META;
MSYM(cs) = c;
ccre = MNEXT(cs);
return;
/* mark the last match sequence as having an alternate */
/* the third byte will contain an offset to jump over the */
/* alternate match in case the first did not fail */
case '|':
2014-07-15 17:17:15 +04:00
if (acs != NULL && acs != cs)
OCNT(ccre) = tocc(ccre - acs); /* make a back pointer */
1993-04-09 16:58:42 +04:00
else
OCNT(ccre) = 0;
assert(cs != NULL);
1993-04-09 16:58:42 +04:00
*cs |= ALT;
cs = ccre;
*cs = OPER;
OSYM(cs) = '|';
ccre = ONEXT(cs);
acs = cs; /* remember that the pointer is to be filles */
break;
/* if its not a metasymbol just build a scharacter string */
default:
2014-07-15 17:17:15 +04:00
if (cs == NULL || (*cs & STR) == 0) {
1993-04-09 16:58:42 +04:00
cs = ccre;
*cs = STR;
SCNT(cs) = 1;
ccre = SSTR(cs);
} else
SCNT(cs)++;
*ccre++ = c;
break;
}
}
2014-07-15 17:17:15 +04:00
if (acs != NULL) {
1993-04-09 16:58:42 +04:00
do {
temp = OCNT(acs);
2014-07-15 17:17:15 +04:00
OCNT(acs) = tocc(ccre - acs);
1993-04-09 16:58:42 +04:00
acs -= temp;
} while (temp != 0);
2014-07-15 17:17:15 +04:00
acs = NULL;
1993-04-09 16:58:42 +04:00
}
return;
}
/* end of convertre */
/*
* The following routine recognises an irregular expresion
* with the following special characters:
*
* \? - means last match was optional
* \a - matches any number of characters
* \d - matches any number of spaces and tabs
* \p - matches any number of alphanumeric
* characters. The
* characters matched will be copied into
* the area pointed to by 'name'.
* \| - alternation
* \( \) - grouping used mostly for alternation and
* optionality
*
* The irregular expression must be translated to internal form
* prior to calling this routine
*
* The value returned is the pointer to the first non \a
* character matched.
*/
char *
2012-03-21 00:34:57 +04:00
expmatch(
char *s, /* string to check for a match in */
char *re, /* a converted irregular expression */
char *mstring) /* where to put whatever matches a \p */
1993-04-09 16:58:42 +04:00
{
char *cs; /* the current symbol */
char *ptr,*s1; /* temporary pointer */
2014-07-15 17:17:15 +04:00
bool matched; /* a temporary bool */
1993-04-09 16:58:42 +04:00
/* initial conditions */
2014-07-15 17:17:15 +04:00
if (re == NULL)
return NULL;
1993-04-09 16:58:42 +04:00
cs = re;
2014-07-15 17:17:15 +04:00
matched = false;
1993-04-09 16:58:42 +04:00
/* loop till expression string is exhausted (or at least pretty tired) */
while (*cs) {
switch (*cs & (OPER | STR | META)) {
/* try to match a string */
case STR:
matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
if (matched) {
/* hoorah it matches */
s += SCNT(cs);
cs = SNEXT(cs);
} else if (*cs & ALT) {
/* alternation, skip to next expression */
cs = SNEXT(cs);
} else if (*cs & OPT) {
/* the match is optional */
cs = SNEXT(cs);
matched = 1; /* indicate a successful match */
} else {
/* no match, error return */
2014-07-15 17:17:15 +04:00
return NULL;
1993-04-09 16:58:42 +04:00
}
break;
/* an operator, do something fancy */
case OPER:
switch (OSYM(cs)) {
/* this is an alternation */
case '|':
if (matched)
/* last thing in the alternation was a match, skip ahead */
cs = OPTR(cs);
else
/* no match, keep trying */
cs = ONEXT(cs);
break;
/* this is a grouping, recurse */
case '(':
2003-07-12 17:37:15 +04:00
ptr = expmatch(s, ONEXT(cs), mstring);
2014-07-15 17:17:15 +04:00
if (ptr != NULL) {
1993-04-09 16:58:42 +04:00
/* the subexpression matched */
matched = 1;
s = ptr;
} else if (*cs & ALT) {
/* alternation, skip to next expression */
matched = 0;
} else if (*cs & OPT) {
/* the match is optional */
matched = 1; /* indicate a successful match */
} else {
/* no match, error return */
2014-07-15 17:17:15 +04:00
return NULL;
1993-04-09 16:58:42 +04:00
}
cs = OPTR(cs);
break;
}
break;
/* try to match a metasymbol */
case META:
switch (MSYM(cs)) {
/* try to match anything and remember what was matched */
case 'p':
/*
* This is really the same as trying the match the
* remaining parts of the expression to any subset
* of the string.
*/
s1 = s;
do {
2003-07-12 17:37:15 +04:00
ptr = expmatch(s1, MNEXT(cs), mstring);
2014-07-15 17:17:15 +04:00
if (ptr != NULL && s1 != s) {
1993-04-09 16:58:42 +04:00
/* we have a match, remember the match */
2014-07-15 17:17:15 +04:00
strncpy(mstring, s, (size_t)(s1 - s));
1993-04-09 16:58:42 +04:00
mstring[s1 - s] = '\0';
2014-07-15 17:17:15 +04:00
return ptr;
} else if (ptr != NULL && (*cs & OPT)) {
1993-04-09 16:58:42 +04:00
/* it was aoptional so no match is ok */
2014-07-15 17:17:15 +04:00
return ptr;
} else if (ptr != NULL) {
1993-04-09 16:58:42 +04:00
/* not optional and we still matched */
2014-07-15 17:17:15 +04:00
return NULL;
1993-04-09 16:58:42 +04:00
}
if (!isalnum((unsigned char)*s1) && *s1 != '_')
2014-07-15 17:17:15 +04:00
return NULL;
1993-04-09 16:58:42 +04:00
if (*s1 == '\\')
2014-07-15 17:17:15 +04:00
x_escaped = x_escaped ? false : true;
1993-04-09 16:58:42 +04:00
else
2014-07-15 17:17:15 +04:00
x_escaped = false;
1993-04-09 16:58:42 +04:00
} while (*s1++);
2014-07-15 17:17:15 +04:00
return NULL;
1993-04-09 16:58:42 +04:00
/* try to match anything */
case 'a':
/*
* This is really the same as trying the match the
* remaining parts of the expression to any subset
* of the string.
*/
s1 = s;
do {
2003-07-12 17:37:15 +04:00
ptr = expmatch(s1, MNEXT(cs), mstring);
2014-07-15 17:17:15 +04:00
if (ptr != NULL && s1 != s) {
1993-04-09 16:58:42 +04:00
/* we have a match */
2014-07-15 17:17:15 +04:00
return ptr;
} else if (ptr != NULL && (*cs & OPT)) {
1993-04-09 16:58:42 +04:00
/* it was aoptional so no match is ok */
2014-07-15 17:17:15 +04:00
return ptr;
} else if (ptr != NULL) {
1993-04-09 16:58:42 +04:00
/* not optional and we still matched */
2014-07-15 17:17:15 +04:00
return NULL;
1993-04-09 16:58:42 +04:00
}
if (*s1 == '\\')
2014-07-15 17:17:15 +04:00
x_escaped = x_escaped ? false : true;
1993-04-09 16:58:42 +04:00
else
2014-07-15 17:17:15 +04:00
x_escaped = false;
1993-04-09 16:58:42 +04:00
} while (*s1++);
2014-07-15 17:17:15 +04:00
return NULL;
1993-04-09 16:58:42 +04:00
/* fail if we are currently x_escaped */
1993-04-09 16:58:42 +04:00
case 'e':
if (x_escaped)
2014-07-15 17:17:15 +04:00
return NULL;
1993-04-09 16:58:42 +04:00
cs = MNEXT(cs);
break;
/* match any number of tabs and spaces */
case 'd':
ptr = s;
while (*s == ' ' || *s == '\t')
s++;
if (s != ptr || s == x_start) {
1993-04-09 16:58:42 +04:00
/* match, be happy */
matched = 1;
cs = MNEXT(cs);
} else if (*s == '\n' || *s == '\0') {
/* match, be happy */
matched = 1;
cs = MNEXT(cs);
} else if (*cs & ALT) {
/* try the next part */
matched = 0;
cs = MNEXT(cs);
} else if (*cs & OPT) {
/* doesn't matter */
matched = 1;
cs = MNEXT(cs);
} else
/* no match, error return */
2014-07-15 17:17:15 +04:00
return NULL;
1993-04-09 16:58:42 +04:00
break;
/* check for end of line */
case '$':
if (*s == '\0' || *s == '\n') {
/* match, be happy */
s++;
matched = 1;
cs = MNEXT(cs);
} else if (*cs & ALT) {
/* try the next part */
matched = 0;
cs = MNEXT(cs);
} else if (*cs & OPT) {
/* doesn't matter */
matched = 1;
cs = MNEXT(cs);
} else
/* no match, error return */
2014-07-15 17:17:15 +04:00
return NULL;
1993-04-09 16:58:42 +04:00
break;
/* check for start of line */
case '^':
if (s == x_start) {
1993-04-09 16:58:42 +04:00
/* match, be happy */
matched = 1;
cs = MNEXT(cs);
} else if (*cs & ALT) {
/* try the next part */
matched = 0;
cs = MNEXT(cs);
} else if (*cs & OPT) {
/* doesn't matter */
matched = 1;
cs = MNEXT(cs);
} else
/* no match, error return */
2014-07-15 17:17:15 +04:00
return NULL;
1993-04-09 16:58:42 +04:00
break;
/* end of a subexpression, return success */
case ')':
2014-07-15 17:17:15 +04:00
return s;
1993-04-09 16:58:42 +04:00
}
break;
}
}
2014-07-15 17:17:15 +04:00
return s;
1993-04-09 16:58:42 +04:00
}