462 lines
11 KiB
C
462 lines
11 KiB
C
|
|
/********************************************
|
|
rexp0.c
|
|
copyright 1991, Michael D. Brennan
|
|
|
|
This is a source file for mawk, an implementation of
|
|
the AWK programming language.
|
|
|
|
Mawk is distributed without warranty under the terms of
|
|
the GNU General Public License, version 2, 1991.
|
|
********************************************/
|
|
|
|
/*$Log: rexp0.c,v $
|
|
/*Revision 1.1.1.1 1993/03/21 09:45:37 cgd
|
|
/*initial import of 386bsd-0.1 sources
|
|
/*
|
|
* Revision 3.6 92/01/21 17:32:51 brennan
|
|
* added some casts so that character classes work with signed chars
|
|
*
|
|
* Revision 3.5 91/10/29 10:53:57 brennan
|
|
* SIZE_T
|
|
*
|
|
* Revision 3.4 91/08/13 09:10:05 brennan
|
|
* VERSION .9994
|
|
*
|
|
* Revision 3.3 91/07/19 07:29:24 brennan
|
|
* backslash at end of regular expression now stands for backslash
|
|
*
|
|
* Revision 3.2 91/07/19 06:58:23 brennan
|
|
* removed small bozo in processing of escape characters
|
|
*
|
|
* Revision 3.1 91/06/07 10:33:20 brennan
|
|
* VERSION 0.995
|
|
*
|
|
* Revision 1.2 91/06/05 08:59:36 brennan
|
|
* changed RE_free to free
|
|
*
|
|
* Revision 1.1 91/06/03 07:10:15 brennan
|
|
* Initial revision
|
|
*
|
|
*/
|
|
|
|
/* lexical scanner */
|
|
|
|
#include "rexp.h"
|
|
|
|
/* static functions */
|
|
static int PROTO( do_str, (int, char **, MACHINE *) ) ;
|
|
static int PROTO( do_class, (char **, MACHINE *) ) ;
|
|
static int PROTO( escape, (char **) ) ;
|
|
static BV *PROTO( store_bvp, (BV *) ) ;
|
|
static int PROTO( ctohex, (int) ) ;
|
|
|
|
|
|
#ifndef EG /* if EG make next array visible */
|
|
static
|
|
#endif
|
|
char RE_char2token[ '|' + 1 ] = {
|
|
0,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
|
|
13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,9,13,13,13,
|
|
6,7,3,4,13,13,10,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
|
|
13,13,5,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
|
|
13,13,13,13,13,13,13,13,13,13,11,12,13,8,13,13,13,13,13,13,
|
|
13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
|
|
13,13,13,13,1} ;
|
|
|
|
#define char2token(x) ( (unsigned char)(x) > '|' ? T_CHAR : RE_char2token[x] )
|
|
|
|
#define NOT_STARTED (-1)
|
|
|
|
static int prev ;
|
|
static char *lp ; /* ptr to reg exp string */
|
|
static unsigned re_len ;
|
|
|
|
|
|
void RE_lex_init( re )
|
|
char *re ;
|
|
{
|
|
lp = re ;
|
|
re_len = strlen(re) + 1 ;
|
|
prev = NOT_STARTED ;
|
|
RE_run_stack_init() ;
|
|
}
|
|
|
|
|
|
int RE_lex( mp )
|
|
MACHINE *mp ;
|
|
{ register int c ;
|
|
|
|
switch( c = char2token(*lp) )
|
|
{
|
|
case T_OR :
|
|
case T_PLUS :
|
|
case T_STAR :
|
|
case T_Q :
|
|
case T_RP :
|
|
lp++ ; return prev = c ;
|
|
case T_SLASH :
|
|
break ;
|
|
|
|
case 0 : return 0 ;
|
|
|
|
case T_LP :
|
|
switch( prev )
|
|
{
|
|
case T_CHAR :
|
|
case T_STR :
|
|
case T_ANY :
|
|
case T_CLASS :
|
|
case T_START :
|
|
case T_RP :
|
|
case T_PLUS :
|
|
case T_STAR :
|
|
case T_Q :
|
|
case T_U :
|
|
return prev = T_CAT ;
|
|
default :
|
|
lp++ ;
|
|
return prev = T_LP ;
|
|
}
|
|
}
|
|
|
|
/* *lp is an operand, but implicit cat op is possible */
|
|
switch( prev )
|
|
{ case NOT_STARTED :
|
|
case T_OR :
|
|
case T_LP :
|
|
case T_CAT :
|
|
|
|
switch( c )
|
|
{ case T_ANY :
|
|
{ static plus_is_star_flag = 0 ;
|
|
|
|
if ( * ++lp == '*' )
|
|
{ lp++ ; *mp = RE_u() ;
|
|
return prev = T_U ; }
|
|
else
|
|
if ( *lp == '+' )
|
|
if ( plus_is_star_flag )
|
|
{ lp++ ; *mp = RE_u() ;
|
|
plus_is_star_flag = 0 ;
|
|
return prev = T_U ;
|
|
}
|
|
else
|
|
{ plus_is_star_flag = 1 ;
|
|
lp-- ; *mp = RE_any() ;
|
|
return prev = T_ANY ;
|
|
}
|
|
else
|
|
{ *mp = RE_any() ;
|
|
prev = T_ANY ;
|
|
}
|
|
}
|
|
break ;
|
|
|
|
case T_SLASH :
|
|
lp++ ; c = escape(&lp) ;
|
|
prev = do_str(c, &lp, mp) ;
|
|
break ;
|
|
|
|
case T_CHAR :
|
|
c = *lp++ ;
|
|
prev = do_str(c, &lp, mp) ;
|
|
break ;
|
|
|
|
case T_CLASS : prev = do_class(&lp, mp) ;
|
|
break ;
|
|
|
|
case T_START : *mp = RE_start() ; lp++ ;
|
|
prev = T_START ;
|
|
break ;
|
|
|
|
case T_END :
|
|
lp++ ; *mp = RE_end() ;
|
|
return prev = T_END ;
|
|
|
|
default :
|
|
RE_panic("bad switch in RE_lex") ;
|
|
}
|
|
break ;
|
|
|
|
default : /* don't advance the pointer, return T_CAT */
|
|
return prev = T_CAT ;
|
|
}
|
|
/* check for end character */
|
|
if ( *lp == '$' )
|
|
{ mp->start->type += END_ON ; lp++ ; }
|
|
return prev ;
|
|
}
|
|
|
|
static int do_str( c, pp, mp)
|
|
int c ; /* the first character */
|
|
char **pp ; /* where to put the re_char pointer on exit */
|
|
MACHINE *mp ; /* where to put the string machine */
|
|
{ register char *p , *s ;
|
|
char *str ;
|
|
unsigned len ;
|
|
|
|
|
|
p = *pp ;
|
|
s = str = RE_malloc( re_len ) ;
|
|
*s++ = c ; len = 1 ;
|
|
|
|
while ( 1 )
|
|
{ char *save ;
|
|
|
|
switch( char2token(*p) )
|
|
{
|
|
case T_CHAR : *s++ = *p++ ;
|
|
break ;
|
|
case T_SLASH :
|
|
save = ++p ;
|
|
*s++ = escape(&save) ;
|
|
p = save ;
|
|
break ;
|
|
|
|
default : goto out ;
|
|
}
|
|
len++ ;
|
|
}
|
|
out:
|
|
/* if len > 1 and we failed on a ? + or * , need to back up */
|
|
if ( len > 1 && (*p == '*' || *p == '+' || *p == '?' ) )
|
|
{ len-- ; p-- ; s-- ; }
|
|
|
|
*s = 0 ;
|
|
*pp = p ;
|
|
*mp = RE_str((char *) RE_realloc(str, len+1) , len) ;
|
|
return T_STR ;
|
|
}
|
|
|
|
|
|
/*--------------------------------------------
|
|
BUILD A CHARACTER CLASS
|
|
*---------------------------*/
|
|
|
|
#define on( b, x) ( (b)[((unsigned char)(x))>>3] |= ( 1 << ((x)&7) ))
|
|
|
|
static void PROTO(block_on, (BV,int,int) ) ;
|
|
|
|
static void block_on( b, x, y)
|
|
BV b ; int x, y ;
|
|
{ int lo = (x&0xff) >> 3 ;
|
|
int hi = (y&0xff) >> 3 ;
|
|
int i, j, bit ;
|
|
|
|
if ( lo == hi )
|
|
{ j = x&7 ; bit = 1 << j ; i = (y&7) - j + 1 ;
|
|
for ( ; i ; i-- , bit <<= 1 ) b[lo] |= bit ; }
|
|
else
|
|
{ for ( i = lo + 1 ; i <= hi - 1 ; i++ ) b[i] = 0xff ;
|
|
b[lo] |= ( 0xff << (x&7) ) ;
|
|
b[hi] |= ~( 0xff << ((y&7)+1)) ;
|
|
}
|
|
}
|
|
|
|
/* build a BV for a character class.
|
|
*start points at the '['
|
|
on exit: *start points at the character after ']'
|
|
mp points at a machine that recognizes the class
|
|
*/
|
|
|
|
static int do_class( start, mp)
|
|
char **start ; MACHINE *mp ;
|
|
{ register char *p ;
|
|
register BV *bvp ;
|
|
int prev ;
|
|
char *q , *t;
|
|
int cnt ;
|
|
int comp_flag ;
|
|
|
|
p = (*start) + 1 ;
|
|
if ( *p == ']' || *p == '^' && *(p+1) == ']' )
|
|
RE_error_trap(-E3) ;
|
|
while ( 1 ) /* find the back of the class */
|
|
{ if ( ! (q = strchr(p,']')) ) /* no closing bracket */
|
|
RE_error_trap(-E3) ;
|
|
p = q-1 ;
|
|
cnt = 0 ;
|
|
while ( *p == '\\') { cnt++ ; p-- ; }
|
|
if ( (cnt & 1) == 0 ) /* even number of \ */ break ;
|
|
p = q+1 ;
|
|
}
|
|
/* q now pts at the back of the class */
|
|
p = (*start) + 1 ;
|
|
*start = q + 1 ;
|
|
|
|
bvp = (BV *) RE_malloc( sizeof(BV) ) ;
|
|
(void) memset( bvp, 0, SIZE_T(sizeof(BV)) ) ;
|
|
|
|
comp_flag = *p == '^' ? (p++ , 1) : 0 ;
|
|
prev = -1 ; /* indicates - cannot be part of a range */
|
|
|
|
while ( p < q )
|
|
{
|
|
switch( *p )
|
|
{ case '\\' :
|
|
t = ++p ;
|
|
prev = escape(&t) ;
|
|
on(*bvp, prev) ;
|
|
p = t ;
|
|
continue ;
|
|
|
|
case '-' :
|
|
if ( prev == -1 || p+1 == q || prev > *(unsigned char*)(p+1) )
|
|
{ prev = '-' ; on(*bvp, '-') ; }
|
|
else
|
|
{ p++ ;
|
|
block_on(*bvp, prev, *p) ;
|
|
prev = -1 ;
|
|
}
|
|
break ;
|
|
|
|
default :
|
|
prev = *(unsigned char*)p ;
|
|
on(*bvp, *p) ;
|
|
break ;
|
|
}
|
|
p++ ;
|
|
}
|
|
|
|
if ( comp_flag )
|
|
for ( p = (char *) bvp ; p < (char *) bvp + sizeof(BV) ; p++) *p = ~*p ;
|
|
|
|
/* make sure zero is off */
|
|
(*bvp)[0] &= 0xfe ;
|
|
|
|
*mp = RE_class( store_bvp( bvp ) ) ;
|
|
return T_CLASS ;
|
|
}
|
|
|
|
|
|
/* storage for bit vectors so they can be reused ,
|
|
stored in an unsorted linear array
|
|
the array grows as needed
|
|
*/
|
|
|
|
#define BV_GROWTH 6
|
|
|
|
static BV *store_bvp( bvp )
|
|
BV *bvp ;
|
|
{
|
|
static BV **bv_base, **bv_limit ;
|
|
static BV **bv_next ; /* next empty slot in the array */
|
|
|
|
register BV **p ;
|
|
unsigned t ;
|
|
|
|
|
|
if ( bv_next == bv_limit ) /* need to grow */
|
|
{
|
|
if ( ! bv_base ) /* first growth */
|
|
{ t = 0 ; bv_base = (BV**)RE_malloc(BV_GROWTH*sizeof(BV*)) ; }
|
|
else
|
|
{ t = bv_next - bv_base ;
|
|
bv_base = (BV**) RE_realloc(bv_base, (t+BV_GROWTH)*sizeof(BV*)) ;
|
|
}
|
|
|
|
bv_next = bv_base + t ;
|
|
bv_limit = bv_next + BV_GROWTH ;
|
|
}
|
|
|
|
/* put bvp in bv_next as a sentinal */
|
|
*bv_next = bvp ;
|
|
p = bv_base ;
|
|
while ( memcmp(*p, bvp, SIZE_T(sizeof(BV))) ) p++ ;
|
|
|
|
if ( p == bv_next ) /* it is new */
|
|
bv_next++ ;
|
|
else /* we already have it */ free(bvp) ;
|
|
|
|
return *p ;
|
|
}
|
|
|
|
|
|
/* ---------- convert escape sequences -------------*/
|
|
|
|
#define isoctal(x) ((x)>='0'&&(x)<='7')
|
|
|
|
#define NOT_HEX 16
|
|
static char hex_val['f' - 'A' + 1] = {
|
|
10,11,12,13,14,15, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
10,11,12,13,14,15 } ;
|
|
|
|
/* interpret 1 character as hex */
|
|
static int ctohex( c )
|
|
register int c ;
|
|
{ int t ;
|
|
|
|
if ( c >= '0' && c <= '9' ) return c - '0' ;
|
|
|
|
if ( c >= 'A' && c <= 'f' && ( t = hex_val[c-'A'] )) return t ;
|
|
|
|
return NOT_HEX ;
|
|
}
|
|
|
|
#define ET_END 7
|
|
|
|
static struct { char in , out ; } escape_test[ET_END+1] = {
|
|
'n' , '\n',
|
|
't' , '\t',
|
|
'f' , '\f',
|
|
'b' , '\b',
|
|
'r' , '\r',
|
|
'a' , '\07',
|
|
'v' , '\013',
|
|
0 , 0 } ;
|
|
|
|
|
|
/*-----------------
|
|
return the char
|
|
and move the pointer forward
|
|
on entry *s -> at the character after the slash
|
|
*-------------------*/
|
|
|
|
static int escape(start_p)
|
|
char **start_p ;
|
|
{ register char *p = *start_p ;
|
|
register unsigned x ;
|
|
unsigned xx ;
|
|
int i ;
|
|
|
|
|
|
escape_test[ET_END].in = *p ;
|
|
i = 0 ;
|
|
while ( escape_test[i].in != *p ) i++ ;
|
|
if ( i != ET_END ) /* in table */
|
|
{ *start_p = p + 1 ;
|
|
return escape_test[i].out ;
|
|
}
|
|
|
|
if ( isoctal(*p) )
|
|
{ x = *p++ - '0' ;
|
|
if ( isoctal(*p) )
|
|
{ x = (x<<3) + *p++ - '0' ;
|
|
if ( isoctal(*p) )
|
|
x = (x<<3) + *p++ - '0' ;
|
|
}
|
|
*start_p = p ;
|
|
return x & 0xff ;
|
|
}
|
|
|
|
if ( *p == 0 ) return '\\' ;
|
|
|
|
if ( *p++ == 'x' ) /* might be a hex digit */
|
|
{ if ( (x = ctohex(*p)) == NOT_HEX )
|
|
{ *start_p = p ; return 'x' ; }
|
|
|
|
/* look for another hex digit */
|
|
if ( (xx = ctohex(* ++p)) != NOT_HEX )
|
|
{ x = (x<<4) + xx ; p++ ; }
|
|
|
|
*start_p = p ; return x ;
|
|
}
|
|
/* anything else \c -> c */
|
|
*start_p = p ;
|
|
return p[-1] ;
|
|
}
|