NetBSD/gnu/usr.bin/awk/rexp/rexp0.c

462 lines
11 KiB
C

/********************************************
rexp0.c
copyright 1991, Michael D. Brennan
This is a source file for mawk, an implementation of
the AWK programming language.
Mawk is distributed without warranty under the terms of
the GNU General Public License, version 2, 1991.
********************************************/
/*$Log: rexp0.c,v $
/*Revision 1.1.1.1 1993/03/21 09:45:37 cgd
/*initial import of 386bsd-0.1 sources
/*
* Revision 3.6 92/01/21 17:32:51 brennan
* added some casts so that character classes work with signed chars
*
* Revision 3.5 91/10/29 10:53:57 brennan
* SIZE_T
*
* Revision 3.4 91/08/13 09:10:05 brennan
* VERSION .9994
*
* Revision 3.3 91/07/19 07:29:24 brennan
* backslash at end of regular expression now stands for backslash
*
* Revision 3.2 91/07/19 06:58:23 brennan
* removed small bozo in processing of escape characters
*
* Revision 3.1 91/06/07 10:33:20 brennan
* VERSION 0.995
*
* Revision 1.2 91/06/05 08:59:36 brennan
* changed RE_free to free
*
* Revision 1.1 91/06/03 07:10:15 brennan
* Initial revision
*
*/
/* lexical scanner */
#include "rexp.h"
/* static functions */
static int PROTO( do_str, (int, char **, MACHINE *) ) ;
static int PROTO( do_class, (char **, MACHINE *) ) ;
static int PROTO( escape, (char **) ) ;
static BV *PROTO( store_bvp, (BV *) ) ;
static int PROTO( ctohex, (int) ) ;
#ifndef EG /* if EG make next array visible */
static
#endif
char RE_char2token[ '|' + 1 ] = {
0,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,9,13,13,13,
6,7,3,4,13,13,10,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
13,13,5,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
13,13,13,13,13,13,13,13,13,13,11,12,13,8,13,13,13,13,13,13,
13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
13,13,13,13,1} ;
#define char2token(x) ( (unsigned char)(x) > '|' ? T_CHAR : RE_char2token[x] )
#define NOT_STARTED (-1)
static int prev ;
static char *lp ; /* ptr to reg exp string */
static unsigned re_len ;
void RE_lex_init( re )
char *re ;
{
lp = re ;
re_len = strlen(re) + 1 ;
prev = NOT_STARTED ;
RE_run_stack_init() ;
}
int RE_lex( mp )
MACHINE *mp ;
{ register int c ;
switch( c = char2token(*lp) )
{
case T_OR :
case T_PLUS :
case T_STAR :
case T_Q :
case T_RP :
lp++ ; return prev = c ;
case T_SLASH :
break ;
case 0 : return 0 ;
case T_LP :
switch( prev )
{
case T_CHAR :
case T_STR :
case T_ANY :
case T_CLASS :
case T_START :
case T_RP :
case T_PLUS :
case T_STAR :
case T_Q :
case T_U :
return prev = T_CAT ;
default :
lp++ ;
return prev = T_LP ;
}
}
/* *lp is an operand, but implicit cat op is possible */
switch( prev )
{ case NOT_STARTED :
case T_OR :
case T_LP :
case T_CAT :
switch( c )
{ case T_ANY :
{ static plus_is_star_flag = 0 ;
if ( * ++lp == '*' )
{ lp++ ; *mp = RE_u() ;
return prev = T_U ; }
else
if ( *lp == '+' )
if ( plus_is_star_flag )
{ lp++ ; *mp = RE_u() ;
plus_is_star_flag = 0 ;
return prev = T_U ;
}
else
{ plus_is_star_flag = 1 ;
lp-- ; *mp = RE_any() ;
return prev = T_ANY ;
}
else
{ *mp = RE_any() ;
prev = T_ANY ;
}
}
break ;
case T_SLASH :
lp++ ; c = escape(&lp) ;
prev = do_str(c, &lp, mp) ;
break ;
case T_CHAR :
c = *lp++ ;
prev = do_str(c, &lp, mp) ;
break ;
case T_CLASS : prev = do_class(&lp, mp) ;
break ;
case T_START : *mp = RE_start() ; lp++ ;
prev = T_START ;
break ;
case T_END :
lp++ ; *mp = RE_end() ;
return prev = T_END ;
default :
RE_panic("bad switch in RE_lex") ;
}
break ;
default : /* don't advance the pointer, return T_CAT */
return prev = T_CAT ;
}
/* check for end character */
if ( *lp == '$' )
{ mp->start->type += END_ON ; lp++ ; }
return prev ;
}
static int do_str( c, pp, mp)
int c ; /* the first character */
char **pp ; /* where to put the re_char pointer on exit */
MACHINE *mp ; /* where to put the string machine */
{ register char *p , *s ;
char *str ;
unsigned len ;
p = *pp ;
s = str = RE_malloc( re_len ) ;
*s++ = c ; len = 1 ;
while ( 1 )
{ char *save ;
switch( char2token(*p) )
{
case T_CHAR : *s++ = *p++ ;
break ;
case T_SLASH :
save = ++p ;
*s++ = escape(&save) ;
p = save ;
break ;
default : goto out ;
}
len++ ;
}
out:
/* if len > 1 and we failed on a ? + or * , need to back up */
if ( len > 1 && (*p == '*' || *p == '+' || *p == '?' ) )
{ len-- ; p-- ; s-- ; }
*s = 0 ;
*pp = p ;
*mp = RE_str((char *) RE_realloc(str, len+1) , len) ;
return T_STR ;
}
/*--------------------------------------------
BUILD A CHARACTER CLASS
*---------------------------*/
#define on( b, x) ( (b)[((unsigned char)(x))>>3] |= ( 1 << ((x)&7) ))
static void PROTO(block_on, (BV,int,int) ) ;
static void block_on( b, x, y)
BV b ; int x, y ;
{ int lo = (x&0xff) >> 3 ;
int hi = (y&0xff) >> 3 ;
int i, j, bit ;
if ( lo == hi )
{ j = x&7 ; bit = 1 << j ; i = (y&7) - j + 1 ;
for ( ; i ; i-- , bit <<= 1 ) b[lo] |= bit ; }
else
{ for ( i = lo + 1 ; i <= hi - 1 ; i++ ) b[i] = 0xff ;
b[lo] |= ( 0xff << (x&7) ) ;
b[hi] |= ~( 0xff << ((y&7)+1)) ;
}
}
/* build a BV for a character class.
*start points at the '['
on exit: *start points at the character after ']'
mp points at a machine that recognizes the class
*/
static int do_class( start, mp)
char **start ; MACHINE *mp ;
{ register char *p ;
register BV *bvp ;
int prev ;
char *q , *t;
int cnt ;
int comp_flag ;
p = (*start) + 1 ;
if ( *p == ']' || *p == '^' && *(p+1) == ']' )
RE_error_trap(-E3) ;
while ( 1 ) /* find the back of the class */
{ if ( ! (q = strchr(p,']')) ) /* no closing bracket */
RE_error_trap(-E3) ;
p = q-1 ;
cnt = 0 ;
while ( *p == '\\') { cnt++ ; p-- ; }
if ( (cnt & 1) == 0 ) /* even number of \ */ break ;
p = q+1 ;
}
/* q now pts at the back of the class */
p = (*start) + 1 ;
*start = q + 1 ;
bvp = (BV *) RE_malloc( sizeof(BV) ) ;
(void) memset( bvp, 0, SIZE_T(sizeof(BV)) ) ;
comp_flag = *p == '^' ? (p++ , 1) : 0 ;
prev = -1 ; /* indicates - cannot be part of a range */
while ( p < q )
{
switch( *p )
{ case '\\' :
t = ++p ;
prev = escape(&t) ;
on(*bvp, prev) ;
p = t ;
continue ;
case '-' :
if ( prev == -1 || p+1 == q || prev > *(unsigned char*)(p+1) )
{ prev = '-' ; on(*bvp, '-') ; }
else
{ p++ ;
block_on(*bvp, prev, *p) ;
prev = -1 ;
}
break ;
default :
prev = *(unsigned char*)p ;
on(*bvp, *p) ;
break ;
}
p++ ;
}
if ( comp_flag )
for ( p = (char *) bvp ; p < (char *) bvp + sizeof(BV) ; p++) *p = ~*p ;
/* make sure zero is off */
(*bvp)[0] &= 0xfe ;
*mp = RE_class( store_bvp( bvp ) ) ;
return T_CLASS ;
}
/* storage for bit vectors so they can be reused ,
stored in an unsorted linear array
the array grows as needed
*/
#define BV_GROWTH 6
static BV *store_bvp( bvp )
BV *bvp ;
{
static BV **bv_base, **bv_limit ;
static BV **bv_next ; /* next empty slot in the array */
register BV **p ;
unsigned t ;
if ( bv_next == bv_limit ) /* need to grow */
{
if ( ! bv_base ) /* first growth */
{ t = 0 ; bv_base = (BV**)RE_malloc(BV_GROWTH*sizeof(BV*)) ; }
else
{ t = bv_next - bv_base ;
bv_base = (BV**) RE_realloc(bv_base, (t+BV_GROWTH)*sizeof(BV*)) ;
}
bv_next = bv_base + t ;
bv_limit = bv_next + BV_GROWTH ;
}
/* put bvp in bv_next as a sentinal */
*bv_next = bvp ;
p = bv_base ;
while ( memcmp(*p, bvp, SIZE_T(sizeof(BV))) ) p++ ;
if ( p == bv_next ) /* it is new */
bv_next++ ;
else /* we already have it */ free(bvp) ;
return *p ;
}
/* ---------- convert escape sequences -------------*/
#define isoctal(x) ((x)>='0'&&(x)<='7')
#define NOT_HEX 16
static char hex_val['f' - 'A' + 1] = {
10,11,12,13,14,15, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
10,11,12,13,14,15 } ;
/* interpret 1 character as hex */
static int ctohex( c )
register int c ;
{ int t ;
if ( c >= '0' && c <= '9' ) return c - '0' ;
if ( c >= 'A' && c <= 'f' && ( t = hex_val[c-'A'] )) return t ;
return NOT_HEX ;
}
#define ET_END 7
static struct { char in , out ; } escape_test[ET_END+1] = {
'n' , '\n',
't' , '\t',
'f' , '\f',
'b' , '\b',
'r' , '\r',
'a' , '\07',
'v' , '\013',
0 , 0 } ;
/*-----------------
return the char
and move the pointer forward
on entry *s -> at the character after the slash
*-------------------*/
static int escape(start_p)
char **start_p ;
{ register char *p = *start_p ;
register unsigned x ;
unsigned xx ;
int i ;
escape_test[ET_END].in = *p ;
i = 0 ;
while ( escape_test[i].in != *p ) i++ ;
if ( i != ET_END ) /* in table */
{ *start_p = p + 1 ;
return escape_test[i].out ;
}
if ( isoctal(*p) )
{ x = *p++ - '0' ;
if ( isoctal(*p) )
{ x = (x<<3) + *p++ - '0' ;
if ( isoctal(*p) )
x = (x<<3) + *p++ - '0' ;
}
*start_p = p ;
return x & 0xff ;
}
if ( *p == 0 ) return '\\' ;
if ( *p++ == 'x' ) /* might be a hex digit */
{ if ( (x = ctohex(*p)) == NOT_HEX )
{ *start_p = p ; return 'x' ; }
/* look for another hex digit */
if ( (xx = ctohex(* ++p)) != NOT_HEX )
{ x = (x<<4) + xx ; p++ ; }
*start_p = p ; return x ;
}
/* anything else \c -> c */
*start_p = p ;
return p[-1] ;
}