187 lines
5.0 KiB
C
187 lines
5.0 KiB
C
|
|
/********************************************
|
|
rexp.c
|
|
copyright 1991, Michael D. Brennan
|
|
|
|
This is a source file for mawk, an implementation of
|
|
the AWK programming language.
|
|
|
|
Mawk is distributed without warranty under the terms of
|
|
the GNU General Public License, version 2, 1991.
|
|
********************************************/
|
|
|
|
/*$Log: rexp.c,v $
|
|
/*Revision 1.1.1.1 1993/03/21 09:45:37 cgd
|
|
/*initial import of 386bsd-0.1 sources
|
|
/*
|
|
* Revision 3.4 91/08/13 09:09:59 brennan
|
|
* VERSION .9994
|
|
*
|
|
* Revision 3.3 91/08/04 15:45:03 brennan
|
|
* no longer attempt to recover mem on failed REcompile
|
|
* Its not worth the effort
|
|
*
|
|
* Revision 3.2 91/08/03 07:24:06 brennan
|
|
* check for empty machine stack (missing operand) wasn't quite right
|
|
*
|
|
* Revision 3.1 91/06/07 10:33:16 brennan
|
|
* VERSION 0.995
|
|
*
|
|
* Revision 1.7 91/06/05 08:58:47 brennan
|
|
* change RE_free to free
|
|
*
|
|
* Revision 1.6 91/06/03 07:07:17 brennan
|
|
* moved parser stacks inside REcompile
|
|
* removed unnecessary copying
|
|
*
|
|
*/
|
|
|
|
/* op precedence parser for regular expressions */
|
|
|
|
#include "rexp.h"
|
|
|
|
|
|
/* DATA */
|
|
int REerrno ;
|
|
char *REerrlist[] = { (char *) 0 ,
|
|
/* 1 */ "missing '('",
|
|
/* 2 */ "missing ')'",
|
|
/* 3 */ "bad class -- [], [^] or [" ,
|
|
/* 4 */ "missing operand" ,
|
|
/* 5 */ "resource exhaustion -- regular expression too large"
|
|
} ;
|
|
/* E5 is very unlikely to occur */
|
|
|
|
/* This table drives the operator precedence parser */
|
|
static short table[8][8] = {
|
|
|
|
/* 0 | CAT * + ? ( ) */
|
|
/* 0 */ 0, L, L, L, L, L, L, E1,
|
|
/* | */ G, G, L, L, L, L, L, G,
|
|
/* CAT*/ G, G, G, L, L, L, L, G,
|
|
/* * */ G, G, G, G, G, G, E7, G,
|
|
/* + */ G, G, G, G, G, G, E7, G,
|
|
/* ? */ G, G, G, G, G, G, E7, G,
|
|
/* ( */ E2, L, L, L, L, L, L, EQ,
|
|
/* ) */ G , G, G, G, G, G, E7, G } ;
|
|
|
|
|
|
#define STACKSZ 64
|
|
|
|
|
|
static jmp_buf err_buf ; /* used to trap on error */
|
|
|
|
void RE_error_trap(x) /* return is dummy to make macro OK */
|
|
int x ;
|
|
{
|
|
REerrno = x ;
|
|
longjmp(err_buf, 1 ) ;
|
|
}
|
|
|
|
|
|
VOID *REcompile(re)
|
|
char *re ;
|
|
{
|
|
MACHINE m_stack[STACKSZ] ;
|
|
struct op {
|
|
int token ;
|
|
int prec ;
|
|
} op_stack[STACKSZ] ;
|
|
register MACHINE *m_ptr ;
|
|
register struct op *op_ptr ;
|
|
register int t ;
|
|
|
|
/* do this first because it also checks if we have a
|
|
run time stack */
|
|
RE_lex_init(re) ;
|
|
|
|
if ( *re == 0 )
|
|
{ STATE *p = (STATE *) RE_malloc( sizeof(STATE) ) ;
|
|
p->type = M_ACCEPT ;
|
|
return (VOID *) p ;
|
|
}
|
|
|
|
if ( setjmp(err_buf) ) return (VOID *) 0 ;
|
|
/* we used to try to recover memory left on machine stack ;
|
|
but now m_ptr is in a register so it won't be right unless
|
|
we force it out of a register which isn't worth the trouble */
|
|
|
|
/* initialize the stacks */
|
|
m_ptr = m_stack - 1 ;
|
|
op_ptr = op_stack ;
|
|
op_ptr->token = 0 ;
|
|
|
|
t = RE_lex(m_stack) ;
|
|
|
|
while( 1 )
|
|
{ switch( t )
|
|
{
|
|
case T_STR :
|
|
case T_ANY :
|
|
case T_U :
|
|
case T_START :
|
|
case T_END :
|
|
case T_CLASS : m_ptr++ ; break ;
|
|
|
|
case 0 : /* end of reg expr */
|
|
if ( op_ptr -> token == 0 ) /* done */
|
|
if ( m_ptr == m_stack ) return (VOID *)m_ptr->start ;
|
|
else
|
|
/* machines still on the stack */
|
|
RE_panic("values still on machine stack") ;
|
|
|
|
/* otherwise fall thru to default
|
|
which is operator case */
|
|
|
|
default:
|
|
|
|
if ( (op_ptr -> prec = table[op_ptr -> token][t]) == G )
|
|
{
|
|
do
|
|
{ /* op_pop */
|
|
|
|
if ( op_ptr->token <= T_CAT ) /*binary op*/ m_ptr-- ;
|
|
/* if not enough values on machine stack
|
|
then we have a missing operand */
|
|
if ( m_ptr < m_stack ) RE_error_trap(-E4) ;
|
|
|
|
switch( op_ptr->token )
|
|
{ case T_CAT : RE_cat(m_ptr, m_ptr+1) ; break ;
|
|
case T_OR : RE_or( m_ptr, m_ptr+1) ; break ;
|
|
case T_STAR : RE_close( m_ptr) ; break ;
|
|
case T_PLUS : RE_poscl( m_ptr ) ; break ;
|
|
case T_Q : RE_01( m_ptr ) ; break ;
|
|
default : break ; /*nothing on ( or ) */
|
|
}
|
|
|
|
op_ptr-- ;
|
|
}
|
|
while ( op_ptr->prec != L ) ;
|
|
|
|
continue ; /* back thru switch at top */
|
|
}
|
|
|
|
if ( op_ptr -> prec < 0 )
|
|
if ( op_ptr->prec == E7 )
|
|
RE_panic("parser returns E7") ;
|
|
else RE_error_trap(-op_ptr->prec) ;
|
|
|
|
if ( ++op_ptr == op_stack + STACKSZ ) /* stack overflow */
|
|
RE_error_trap(-E5) ;
|
|
op_ptr -> token = t ;
|
|
} /* end of switch */
|
|
|
|
if ( m_ptr == m_stack+(STACKSZ-1) ) /*overflow*/
|
|
RE_error_trap(-E5) ;
|
|
t = RE_lex(m_ptr+1) ;
|
|
}
|
|
}
|
|
|
|
|
|
/* getting here means a logic flaw or unforeseen case */
|
|
void RE_panic( s )
|
|
char *s ;
|
|
{ fprintf( stderr, "REcompile() - panic: %s\n", s) ;
|
|
exit(100) ; }
|
|
|