602 lines
14 KiB
C
602 lines
14 KiB
C
/* $NetBSD: regexp.c,v 1.14 2019/12/27 09:32:10 msaitoh Exp $ */
|
|
|
|
/*
|
|
* Copyright (c) 1980, 1993
|
|
* The Regents of the University of California. All rights reserved.
|
|
*
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. Neither the name of the University nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*/
|
|
|
|
#if HAVE_NBTOOL_CONFIG_H
|
|
#include "nbtool_config.h"
|
|
#endif
|
|
|
|
#include <sys/cdefs.h>
|
|
#ifndef lint
|
|
__COPYRIGHT("@(#) Copyright (c) 1980, 1993\
|
|
The Regents of the University of California. All rights reserved.");
|
|
#endif /* not lint */
|
|
|
|
#ifndef lint
|
|
#if 0
|
|
static char sccsid[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93";
|
|
#endif
|
|
__RCSID("$NetBSD: regexp.c,v 1.14 2019/12/27 09:32:10 msaitoh Exp $");
|
|
#endif /* not lint */
|
|
|
|
#include <assert.h>
|
|
#include <ctype.h>
|
|
#include <stdlib.h>
|
|
#include <stdbool.h>
|
|
#include <string.h>
|
|
#include "extern.h"
|
|
|
|
static void expconv (void);
|
|
|
|
bool x_escaped; /* true if we are currently x_escaped */
|
|
char *x_start; /* start of string */
|
|
bool l_onecase; /* true if upper and lower equivalent */
|
|
|
|
#define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
|
|
|
|
static char
|
|
tocc(ptrdiff_t x) {
|
|
if (x & ~0xff)
|
|
abort();
|
|
return (char)x;
|
|
}
|
|
|
|
/* STRNCMP - like strncmp except that we convert the
|
|
* first string to lower case before comparing
|
|
* if l_onecase is set.
|
|
*/
|
|
|
|
int
|
|
STRNCMP(char *s1, char *s2, int len)
|
|
{
|
|
if (l_onecase) {
|
|
do
|
|
if (*s2 - makelower(*s1))
|
|
return *s2 - makelower(*s1);
|
|
else {
|
|
s2++;
|
|
s1++;
|
|
}
|
|
while (--len);
|
|
} else {
|
|
do
|
|
if (*s2 - *s1)
|
|
return *s2 - *s1;
|
|
else {
|
|
s2++;
|
|
s1++;
|
|
}
|
|
while (--len);
|
|
}
|
|
return(0);
|
|
}
|
|
|
|
/* The following routine converts an irregular expression to
|
|
* internal format.
|
|
*
|
|
* Either meta symbols (\a \d or \p) or character strings or
|
|
* operations ( alternation or perenthesizing ) can be
|
|
* specified. Each starts with a descriptor byte. The descriptor
|
|
* byte has STR set for strings, META set for meta symbols
|
|
* and OPER set for operations.
|
|
* The descriptor byte can also have the OPT bit set if the object
|
|
* defined is optional. Also ALT can be set to indicate an alternation.
|
|
*
|
|
* For metasymbols the byte following the descriptor byte identities
|
|
* the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
|
|
* strings the byte after the descriptor is a character count for
|
|
* the string:
|
|
*
|
|
* meta symbols := descriptor
|
|
* symbol
|
|
*
|
|
* strings := descriptor
|
|
* character count
|
|
* the string
|
|
*
|
|
* operatins := descriptor
|
|
* symbol
|
|
* character count
|
|
*/
|
|
|
|
/*
|
|
* handy macros for accessing parts of match blocks
|
|
*/
|
|
#define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
|
|
#define MNEXT(A) (A+2) /* character following a metasymbol block */
|
|
|
|
#define OSYM(A) (*(A+1)) /* symbol in an operation block */
|
|
#define OCNT(A) (*(A+2)) /* character count */
|
|
#define ONEXT(A) (A+3) /* next character after the operation */
|
|
#define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
|
|
|
|
#define SCNT(A) (*(A+1)) /* byte count of a string */
|
|
#define SSTR(A) (A+2) /* address of the string */
|
|
#define SNEXT(A) (A+2+*(A+1)) /* character following the string */
|
|
|
|
/*
|
|
* bit flags in the descriptor
|
|
*/
|
|
#define OPT 1
|
|
#define STR 2
|
|
#define META 4
|
|
#define ALT 8
|
|
#define OPER 16
|
|
|
|
static char *ccre; /* pointer to current position in converted exp*/
|
|
static char *ure; /* pointer current position in unconverted exp */
|
|
|
|
char *
|
|
convexp(char *re) /* unconverted irregular expression */
|
|
{
|
|
char *cre; /* pointer to converted regular expression */
|
|
|
|
/* allocate room for the converted expression */
|
|
if (re == NULL)
|
|
return NULL;
|
|
if (*re == '\0')
|
|
return NULL;
|
|
cre = malloc(4 * strlen(re) + 3);
|
|
ccre = cre;
|
|
ure = re;
|
|
|
|
/* start the conversion with a \a */
|
|
*cre = META | OPT;
|
|
MSYM(cre) = 'a';
|
|
ccre = MNEXT(cre);
|
|
|
|
/* start the conversion (its recursive) */
|
|
expconv();
|
|
*ccre = 0;
|
|
return cre;
|
|
}
|
|
|
|
static void
|
|
expconv(void)
|
|
{
|
|
char *cs; /* pointer to current symbol in converted exp */
|
|
char c; /* character being processed */
|
|
char *acs; /* pinter to last alternate */
|
|
int temp;
|
|
|
|
/* let the conversion begin */
|
|
acs = NULL;
|
|
cs = NULL;
|
|
while (*ure) {
|
|
switch (c = *ure++) {
|
|
|
|
case '\\':
|
|
switch (c = *ure++) {
|
|
|
|
/* escaped characters are just characters */
|
|
default:
|
|
if (cs == NULL || (*cs & STR) == 0) {
|
|
cs = ccre;
|
|
*cs = STR;
|
|
SCNT(cs) = 1;
|
|
ccre += 2;
|
|
} else
|
|
SCNT(cs)++;
|
|
*ccre++ = c;
|
|
break;
|
|
|
|
/* normal(?) metacharacters */
|
|
case 'a':
|
|
case 'd':
|
|
case 'e':
|
|
case 'p':
|
|
if (acs != NULL && acs != cs) {
|
|
do {
|
|
temp = OCNT(acs);
|
|
OCNT(acs) = tocc(ccre - acs);
|
|
acs -= temp;
|
|
} while (temp != 0);
|
|
acs = NULL;
|
|
}
|
|
cs = ccre;
|
|
*cs = META;
|
|
MSYM(cs) = c;
|
|
ccre = MNEXT(cs);
|
|
break;
|
|
}
|
|
break;
|
|
|
|
/* just put the symbol in */
|
|
case '^':
|
|
case '$':
|
|
if (acs != NULL && acs != cs) {
|
|
do {
|
|
temp = OCNT(acs);
|
|
OCNT(acs) = tocc(ccre - acs);
|
|
acs -= temp;
|
|
} while (temp != 0);
|
|
acs = NULL;
|
|
}
|
|
cs = ccre;
|
|
*cs = META;
|
|
MSYM(cs) = c;
|
|
ccre = MNEXT(cs);
|
|
break;
|
|
|
|
/* mark the last match sequence as optional */
|
|
case '?':
|
|
if (cs)
|
|
*cs = *cs | OPT;
|
|
break;
|
|
|
|
/* recurse and define a subexpression */
|
|
case '(':
|
|
if (acs != NULL && acs != cs) {
|
|
do {
|
|
temp = OCNT(acs);
|
|
OCNT(acs) = tocc(ccre - acs);
|
|
acs -= temp;
|
|
} while (temp != 0);
|
|
acs = NULL;
|
|
}
|
|
cs = ccre;
|
|
*cs = OPER;
|
|
OSYM(cs) = '(';
|
|
ccre = ONEXT(cs);
|
|
expconv();
|
|
OCNT(cs) = tocc(ccre - cs); /* offset to next symbol */
|
|
break;
|
|
|
|
/* reurn from a recursion */
|
|
case ')':
|
|
if (acs != NULL) {
|
|
do {
|
|
temp = OCNT(acs);
|
|
OCNT(acs) = tocc(ccre - acs);
|
|
acs -= temp;
|
|
} while (temp != 0);
|
|
acs = NULL;
|
|
}
|
|
cs = ccre;
|
|
*cs = META;
|
|
MSYM(cs) = c;
|
|
ccre = MNEXT(cs);
|
|
return;
|
|
|
|
/* mark the last match sequence as having an alternate */
|
|
/* the third byte will contain an offset to jump over the */
|
|
/* alternate match in case the first did not fail */
|
|
case '|':
|
|
if (acs != NULL && acs != cs)
|
|
OCNT(ccre) = tocc(ccre - acs); /* make a back pointer */
|
|
else
|
|
OCNT(ccre) = 0;
|
|
assert(cs != NULL);
|
|
*cs |= ALT;
|
|
cs = ccre;
|
|
*cs = OPER;
|
|
OSYM(cs) = '|';
|
|
ccre = ONEXT(cs);
|
|
acs = cs; /* remember that the pointer is to be filles */
|
|
break;
|
|
|
|
/* if its not a metasymbol just build a scharacter string */
|
|
default:
|
|
if (cs == NULL || (*cs & STR) == 0) {
|
|
cs = ccre;
|
|
*cs = STR;
|
|
SCNT(cs) = 1;
|
|
ccre = SSTR(cs);
|
|
} else
|
|
SCNT(cs)++;
|
|
*ccre++ = c;
|
|
break;
|
|
}
|
|
}
|
|
if (acs != NULL) {
|
|
do {
|
|
temp = OCNT(acs);
|
|
OCNT(acs) = tocc(ccre - acs);
|
|
acs -= temp;
|
|
} while (temp != 0);
|
|
acs = NULL;
|
|
}
|
|
return;
|
|
}
|
|
/* end of convertre */
|
|
|
|
|
|
/*
|
|
* The following routine recognises an irregular expression
|
|
* with the following special characters:
|
|
*
|
|
* \? - means last match was optional
|
|
* \a - matches any number of characters
|
|
* \d - matches any number of spaces and tabs
|
|
* \p - matches any number of alphanumeric
|
|
* characters. The
|
|
* characters matched will be copied into
|
|
* the area pointed to by 'name'.
|
|
* \| - alternation
|
|
* \( \) - grouping used mostly for alternation and
|
|
* optionality
|
|
*
|
|
* The irregular expression must be translated to internal form
|
|
* prior to calling this routine
|
|
*
|
|
* The value returned is the pointer to the first non \a
|
|
* character matched.
|
|
*/
|
|
|
|
char *
|
|
expmatch(
|
|
char *s, /* string to check for a match in */
|
|
char *re, /* a converted irregular expression */
|
|
char *mstring) /* where to put whatever matches a \p */
|
|
{
|
|
char *cs; /* the current symbol */
|
|
char *ptr,*s1; /* temporary pointer */
|
|
bool matched; /* a temporary bool */
|
|
|
|
/* initial conditions */
|
|
if (re == NULL)
|
|
return NULL;
|
|
cs = re;
|
|
matched = false;
|
|
|
|
/* loop till expression string is exhausted (or at least pretty tired) */
|
|
while (*cs) {
|
|
switch (*cs & (OPER | STR | META)) {
|
|
|
|
/* try to match a string */
|
|
case STR:
|
|
matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
|
|
if (matched) {
|
|
|
|
/* hoorah it matches */
|
|
s += SCNT(cs);
|
|
cs = SNEXT(cs);
|
|
} else if (*cs & ALT) {
|
|
|
|
/* alternation, skip to next expression */
|
|
cs = SNEXT(cs);
|
|
} else if (*cs & OPT) {
|
|
|
|
/* the match is optional */
|
|
cs = SNEXT(cs);
|
|
matched = 1; /* indicate a successful match */
|
|
} else {
|
|
|
|
/* no match, error return */
|
|
return NULL;
|
|
}
|
|
break;
|
|
|
|
/* an operator, do something fancy */
|
|
case OPER:
|
|
switch (OSYM(cs)) {
|
|
|
|
/* this is an alternation */
|
|
case '|':
|
|
if (matched)
|
|
|
|
/* last thing in the alternation was a match, skip ahead */
|
|
cs = OPTR(cs);
|
|
else
|
|
|
|
/* no match, keep trying */
|
|
cs = ONEXT(cs);
|
|
break;
|
|
|
|
/* this is a grouping, recurse */
|
|
case '(':
|
|
ptr = expmatch(s, ONEXT(cs), mstring);
|
|
if (ptr != NULL) {
|
|
|
|
/* the subexpression matched */
|
|
matched = 1;
|
|
s = ptr;
|
|
} else if (*cs & ALT) {
|
|
|
|
/* alternation, skip to next expression */
|
|
matched = 0;
|
|
} else if (*cs & OPT) {
|
|
|
|
/* the match is optional */
|
|
matched = 1; /* indicate a successful match */
|
|
} else {
|
|
|
|
/* no match, error return */
|
|
return NULL;
|
|
}
|
|
cs = OPTR(cs);
|
|
break;
|
|
}
|
|
break;
|
|
|
|
/* try to match a metasymbol */
|
|
case META:
|
|
switch (MSYM(cs)) {
|
|
|
|
/* try to match anything and remember what was matched */
|
|
case 'p':
|
|
/*
|
|
* This is really the same as trying the match the
|
|
* remaining parts of the expression to any subset
|
|
* of the string.
|
|
*/
|
|
s1 = s;
|
|
do {
|
|
ptr = expmatch(s1, MNEXT(cs), mstring);
|
|
if (ptr != NULL && s1 != s) {
|
|
|
|
/* we have a match, remember the match */
|
|
strncpy(mstring, s, (size_t)(s1 - s));
|
|
mstring[s1 - s] = '\0';
|
|
return ptr;
|
|
} else if (ptr != NULL && (*cs & OPT)) {
|
|
|
|
/* it was aoptional so no match is ok */
|
|
return ptr;
|
|
} else if (ptr != NULL) {
|
|
|
|
/* not optional and we still matched */
|
|
return NULL;
|
|
}
|
|
if (!isalnum((unsigned char)*s1) && *s1 != '_')
|
|
return NULL;
|
|
if (*s1 == '\\')
|
|
x_escaped = x_escaped ? false : true;
|
|
else
|
|
x_escaped = false;
|
|
} while (*s1++);
|
|
return NULL;
|
|
|
|
/* try to match anything */
|
|
case 'a':
|
|
/*
|
|
* This is really the same as trying the match the
|
|
* remaining parts of the expression to any subset
|
|
* of the string.
|
|
*/
|
|
s1 = s;
|
|
do {
|
|
ptr = expmatch(s1, MNEXT(cs), mstring);
|
|
if (ptr != NULL && s1 != s) {
|
|
|
|
/* we have a match */
|
|
return ptr;
|
|
} else if (ptr != NULL && (*cs & OPT)) {
|
|
|
|
/* it was aoptional so no match is ok */
|
|
return ptr;
|
|
} else if (ptr != NULL) {
|
|
|
|
/* not optional and we still matched */
|
|
return NULL;
|
|
}
|
|
if (*s1 == '\\')
|
|
x_escaped = x_escaped ? false : true;
|
|
else
|
|
x_escaped = false;
|
|
} while (*s1++);
|
|
return NULL;
|
|
|
|
/* fail if we are currently x_escaped */
|
|
case 'e':
|
|
if (x_escaped)
|
|
return NULL;
|
|
cs = MNEXT(cs);
|
|
break;
|
|
|
|
/* match any number of tabs and spaces */
|
|
case 'd':
|
|
ptr = s;
|
|
while (*s == ' ' || *s == '\t')
|
|
s++;
|
|
if (s != ptr || s == x_start) {
|
|
|
|
/* match, be happy */
|
|
matched = 1;
|
|
cs = MNEXT(cs);
|
|
} else if (*s == '\n' || *s == '\0') {
|
|
|
|
/* match, be happy */
|
|
matched = 1;
|
|
cs = MNEXT(cs);
|
|
} else if (*cs & ALT) {
|
|
|
|
/* try the next part */
|
|
matched = 0;
|
|
cs = MNEXT(cs);
|
|
} else if (*cs & OPT) {
|
|
|
|
/* doesn't matter */
|
|
matched = 1;
|
|
cs = MNEXT(cs);
|
|
} else
|
|
|
|
/* no match, error return */
|
|
return NULL;
|
|
break;
|
|
|
|
/* check for end of line */
|
|
case '$':
|
|
if (*s == '\0' || *s == '\n') {
|
|
|
|
/* match, be happy */
|
|
s++;
|
|
matched = 1;
|
|
cs = MNEXT(cs);
|
|
} else if (*cs & ALT) {
|
|
|
|
/* try the next part */
|
|
matched = 0;
|
|
cs = MNEXT(cs);
|
|
} else if (*cs & OPT) {
|
|
|
|
/* doesn't matter */
|
|
matched = 1;
|
|
cs = MNEXT(cs);
|
|
} else
|
|
|
|
/* no match, error return */
|
|
return NULL;
|
|
break;
|
|
|
|
/* check for start of line */
|
|
case '^':
|
|
if (s == x_start) {
|
|
|
|
/* match, be happy */
|
|
matched = 1;
|
|
cs = MNEXT(cs);
|
|
} else if (*cs & ALT) {
|
|
|
|
/* try the next part */
|
|
matched = 0;
|
|
cs = MNEXT(cs);
|
|
} else if (*cs & OPT) {
|
|
|
|
/* doesn't matter */
|
|
matched = 1;
|
|
cs = MNEXT(cs);
|
|
} else
|
|
|
|
/* no match, error return */
|
|
return NULL;
|
|
break;
|
|
|
|
/* end of a subexpression, return success */
|
|
case ')':
|
|
return s;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
return s;
|
|
}
|