/********************************************
scan.c
copyright 1991, Michael D. Brennan

This is a source file for mawk, an implementation of
the AWK programming language.

Mawk is distributed without warranty under the terms of
the GNU General Public License, version 2, 1991.
********************************************/

/* $Log: scan.c,v $
 * Revision 1.2  1993/07/02 23:57:54  jtc
 * Updated to mawk 1.1.4
 *
 * Revision 5.4.1.1  1993/01/15  03:33:50  mike
 * patch3: safer double to int conversion
 *
 * Revision 5.4  1992/11/29  18:57:50  mike
 * field expressions convert to long so 16 bit and 32 bit
 * systems behave the same
 *
 * Revision 5.3  1992/07/08  15:43:41  brennan
 * patch2: length returns.  I am a wimp
 *
 * Revision 5.2  1992/02/21  14:16:53  brennan
 * fix:  getline <=
 *
 * Revision 5.1  91/12/05  07:56:27  brennan
 * 1.1 pre-release
 *
 */

#include "mawk.h"
#include "sizes.h"
#include "scan.h"
#include "memory.h"
#include "field.h"
#include "init.h"
#include "fin.h"
#include "repl.h"
#include "code.h"

#if HAVE_FCNTL_H
#include <fcntl.h>
#endif

#include "files.h"


/* static functions */
static void PROTO(scan_fillbuff, (void) ) ;
static void PROTO(scan_open, (void) ) ;
static int PROTO(slow_next, (void) ) ;
static void PROTO(eat_comment, (void) ) ;
static void PROTO(eat_semi_colon, (void) ) ;
static double PROTO(collect_decimal, (int, int *) ) ;
static int PROTO(collect_string, (void) ) ;
static int PROTO(collect_RE, (void) ) ;


/*-----------------------------
  program file management
 *----------------------------*/

char *pfile_name ;        /* name of the program file being scanned */
STRING *program_string ;  /* holds the program text when given on the command line */
PFILE *pfile_list ;       /* remaining program files, consumed by slow_next() */

static unsigned char *buffer ;  /* read buffer (program-from-file case only) */
static unsigned char *buffp ;   /* current scan position */
    /* unsigned so it works with 8 bit chars */
static int program_fd ;   /* -1: command line program, 0: stdin, >0: opened file */
static int eof_flag ;     /* set once the current input has been read completely */


/*
 * Set up the scanner and position it on the first token.
 *
 * cmdline_program: the program text when it was given on the command
 *   line, or a null pointer when the program is to be read from
 *   pfile_name.
 *
 * Exits via mawk_exit(1) with "no program" if the first character
 * after leading space/newlines/comments is the terminating 0.
 */
void scan_init(cmdline_program)
    char * cmdline_program ;
{
    if ( cmdline_program )
    {
        program_fd = -1 ; /* command line program */
        program_string = new_STRING((char *)0,
                                    strlen(cmdline_program) + 1 ) ;
        (void) strcpy(program_string->str, cmdline_program) ;

        /* simulate file termination: the last of the len bytes becomes
           a newline (presumably new_STRING keeps a 0 after it -- TODO
           confirm against new_STRING) */
        program_string->str[program_string->len-1] = '\n' ;
        buffp = (unsigned char *) program_string->str ;
        eof_flag = 1 ;
    }
    else /* program from file[s] */
    {
        scan_open() ;
        buffp = buffer = (unsigned char *) zmalloc( BUFFSZ+1 ) ;
        scan_fillbuff() ;
    }

    eat_nl() ; /* scan to first token */
    if ( next() == 0 )
    {
        errmsg(0, "no program") ;
        mawk_exit(1) ;
    }
    un_next() ;
}

/*
 * Open pfile_name for reading and store the descriptor in program_fd.
 * The name "-" selects descriptor 0 instead of opening a file.
 * Reports the error and exits via mawk_exit(1) if open() fails.
 */
static void scan_open() /* open pfile_name */
{
    if ( pfile_name[0] == '-' && pfile_name[1] == 0 )
        program_fd = 0 ;
    else if ( (program_fd = open(pfile_name, O_RDONLY, 0)) == -1 )
    {
        errmsg( errno, "cannot open %s", pfile_name) ;
        mawk_exit(1) ;
    }
}

/*
 * Release the scanner's input resources and re-purpose the scan_code
 * table entries for whitespace characters.
 */
void scan_cleanup()
{
    /* program_fd >= 0 means the text came from a file and lives in
       buffer; otherwise it lives in program_string */
    if ( program_fd >= 0 )
        zfree(buffer, BUFFSZ+1) ;
    else
        free_STRING(program_string) ;

    /* descriptor 0 is never closed here */
    if ( program_fd > 0 )
        (void) close(program_fd) ;

    /* redefine SPACE as [ \t\n] */
    scan_code['\n'] = posix_space_flag && rs_shadow.type != SEP_MLR
                          ? SC_UNEXPECTED : SC_SPACE ;
    scan_code['\f'] = SC_UNEXPECTED ;   /* value doesn't matter */
    scan_code['\013'] = SC_UNEXPECTED ; /* \v not space */
    scan_code['\r'] = SC_UNEXPECTED ;
}

/*--------------------------------
  global variables shared by yyparse() and yylex()
  and used for error messages too
 *-------------------------------*/

int current_token = -1 ;
unsigned token_lineno ;
unsigned compile_error_count ;
int NR_flag ;      /* are we tracking NR */
int paren_cnt ;
int brace_cnt ;
int print_flag ;   /* changes meaning of '>' */
int getline_flag ; /* changes meaning of '<' */

extern YYSTYPE yylval ;

/*----------------------------------------
  file reading functions
  next() and un_next(c) are macros in scan.h
 *---------------------*/

static unsigned lineno = 1 ;

/*
 * Read up to BUFFSZ bytes of program text into buffer.
 * A short read sets eof_flag; if the data read does not end in a
 * newline, one is appended followed by a 0 terminator.
 */
static void scan_fillbuff()
{
    unsigned r ;

    r = fillbuff(program_fd, (char *)buffer, BUFFSZ) ;
    if ( r < BUFFSZ )
    {
        eof_flag = 1 ;
        /* check eof is terminated */
        if ( r && buffer[r-1] != '\n' )
        {
            /* r < BUFFSZ and buffer holds BUFFSZ+1 bytes, so r+1 is
               in bounds */
            buffer[r] = '\n' ;
            buffer[r+1] = 0 ;
        }
    }
}

/*
 * read one character -- slowly
 *
 * Runs while the character at buffp is 0: refills the buffer, or,
 * once the current input is exhausted, switches to the next file on
 * pfile_list.  Returns 0 at the real end of all program text.
 */
static int slow_next()
{
    while ( *buffp == 0 )
    {
        if ( !eof_flag )
        {
            buffp = buffer ;
            scan_fillbuff() ;
        }
        else if ( pfile_list /* open another program file */ )
        {
            PFILE *q ;

            if ( program_fd > 0 )
                (void) close(program_fd) ;
            eof_flag = 0 ;
            pfile_name = pfile_list->fname ;
            q = pfile_list ;
            pfile_list = pfile_list->link ;
            ZFREE(q) ;
            scan_open() ;
            token_lineno = lineno = 1 ; /* line numbers restart per file */
        }
        else
            break /* real eof */ ;
    }

    return *buffp++ ; /* note can un_next() , eof which is zero */
}

/*
 * Skip the rest of a comment.  Stops at a newline or at a character
 * whose scan_code is 0; that character is pushed back for the caller.
 */
static void eat_comment()
{
    register int c ;

    while ( (c = next()) != '\n' && scan_code[c] )
        ;
    un_next() ;
}

/* this is how we handle extra semi-colons that are
   now allowed to separate pattern-action blocks

   A proof that they are useless clutter to the language:
   we throw them away
*/

/*
 * Skip SC_SPACE characters and then consume one ';' if it is the
 * next character; any other character is pushed back.
 */
static void eat_semi_colon()
/* eat one semi-colon on the current line */
{
    register int c ;

    while ( scan_code[c = next()] == SC_SPACE )
        ;
    if ( c != ';' )
        un_next() ;
}

/*
 * Consume space, comments and newlines (counting lines) up to the
 * first character of any other class, which is pushed back.
 */
void eat_nl() /* eat all space including newlines */
{
    while ( 1 )
        switch( scan_code[next()] )
        {
            case SC_COMMENT :
                eat_comment() ;
                break ;

            case SC_NL :
                lineno++ ;
                /* fall thru */

            case SC_SPACE :
                break ;

            default :
                un_next() ;
                return ;
        }
}

/*
 * The lexical analyzer.
 *
 * Dispatches on the scan_code class of the next input character,
 * stores the token in current_token (via the ct_ret()/test1_ret()
 * macros from scan.h) and returns it.  Token values are passed in
 * yylval; for several operators the token text is also written to
 * string_buff.  token_lineno records the line the token started on.
 */
int yylex()
{
    register int c ;

    token_lineno = lineno ;

reswitch:

    switch( scan_code[c = next()] )
    {
        case 0 :
            ct_ret(EOF) ;

        case SC_SPACE :
            goto reswitch ;

        case SC_COMMENT :
            eat_comment() ;
            goto reswitch ;

        case SC_NL :
            lineno++ ;
            eat_nl() ;
            ct_ret(NL) ;

        case SC_ESCAPE :
            /* backslash, optional space, newline: line continuation */
            while ( scan_code[ c = next() ] == SC_SPACE )
                ;
            if ( c == '\n')
            {
                token_lineno = ++lineno ;
                goto reswitch ;
            }
            if ( c == 0 )
                ct_ret(EOF) ;
            un_next() ;
            yylval.ival = '\\' ;
            ct_ret(UNEXPECTED) ;

        case SC_SEMI_COLON :
            eat_nl() ;
            ct_ret(SEMI_COLON) ;

        case SC_LBRACE :
            eat_nl() ;
            brace_cnt++ ;
            ct_ret(LBRACE) ;

        case SC_PLUS :
            switch( next() )
            {
                case '+' :
                    yylval.ival = '+' ;
                    string_buff[0] = string_buff[1] = '+' ;
                    string_buff[2] = 0 ;
                    ct_ret(INC_or_DEC) ;

                case '=' :
                    ct_ret(ADD_ASG) ;

                default :
                    un_next() ;
                    ct_ret(PLUS) ;
            }

        case SC_MINUS :
            switch( next() )
            {
                case '-' :
                    yylval.ival = '-' ;
                    string_buff[0] = string_buff[1] = '-' ;
                    string_buff[2] = 0 ;
                    ct_ret(INC_or_DEC) ;

                case '=' :
                    ct_ret(SUB_ASG) ;

                default :
                    un_next() ;
                    ct_ret(MINUS) ;
            }

        case SC_COMMA :
            eat_nl() ;
            ct_ret(COMMA) ;

        case SC_MUL :
            test1_ret('=', MUL_ASG, MUL) ;

        case SC_DIV :
            {
                /* a '/' following one of these tokens is taken as
                   division; anywhere else it starts a regular
                   expression */
                static const int can_precede_div[] =
                    { DOUBLE, STRING_, RPAREN, ID, D_ID, RE, RBOX, FIELD,
                      GETLINE, INC_or_DEC, -1 } ;

                const int *p = can_precede_div ;

                do
                    if ( *p == current_token )
                    {
                        if ( *p != INC_or_DEC )
                        {
                            test1_ret('=', DIV_ASG, DIV) ;
                        }

                        /* NOTE(review): when the previous token is
                           INC_or_DEC and the next character is not
                           '=', that character is not un_next()'d
                           before collect_RE() runs below -- confirm
                           this is intended */
                        if ( next() == '=' )
                        {
                            un_next() ;
                            ct_ret( collect_RE() ) ;
                        }
                    }
                while ( * ++p != -1 ) ;

                ct_ret( collect_RE() ) ;
            }

        case SC_MOD :
            test1_ret('=', MOD_ASG, MOD) ;

        case SC_POW :
            test1_ret('=' , POW_ASG, POW) ;

        case SC_LPAREN :
            paren_cnt++ ;
            ct_ret(LPAREN) ;

        case SC_RPAREN :
            if ( --paren_cnt < 0 )
            {
                compile_error( "extra ')'" ) ;
                paren_cnt = 0 ;
                goto reswitch ;
            }
            ct_ret(RPAREN) ;

        case SC_LBOX :
            ct_ret(LBOX) ;

        case SC_RBOX :
            ct_ret(RBOX) ;

        case SC_MATCH :
            /* fix: the terminator used to be stored at index 0, which
               wiped out the '~' just written */
            string_buff[0] = '~' ;
            string_buff[1] = 0 ;
            yylval.ival = 1 ;
            ct_ret(MATCH) ;

        case SC_EQUAL :
            test1_ret( '=', EQ, ASSIGN ) ;

        case SC_NOT : /* ! */
            if ( (c = next()) == '~' )
            {
                string_buff[0] = '!' ;
                string_buff[1] = '~' ;
                string_buff[2] = 0 ;
                yylval.ival = 0 ;
                ct_ret(MATCH) ;
            }
            else if ( c == '=' )
                ct_ret(NEQ) ;

            un_next() ;
            ct_ret(NOT) ;

        case SC_LT : /* '<' */
            if ( next() == '=' )
                ct_ret(LTE) ;
            else
                un_next() ;

            if ( getline_flag )
            {
                getline_flag = 0 ;
                ct_ret(IO_IN) ;
            }
            else
                ct_ret(LT) ;

        case SC_GT : /* '>' */
            if ( print_flag && paren_cnt == 0 )
            {
                print_flag = 0 ;
                /* there are 3 types of IO_OUT
                   -- build the error string in string_buff */
                string_buff[0] = '>' ;
                if ( next() == '>' )
                {
                    yylval.ival = F_APPEND ;
                    string_buff[1] = '>' ;
                    string_buff[2] = 0 ;
                }
                else
                {
                    un_next() ;
                    yylval.ival = F_TRUNC ;
                    string_buff[1] = 0 ;
                }
                return current_token = IO_OUT ;
            }

            test1_ret('=', GTE, GT) ;

        case SC_OR :
            if ( next() == '|' )
            {
                eat_nl() ;
                ct_ret(OR) ;
            }
            else
            {
                un_next() ;

                if ( print_flag && paren_cnt == 0 )
                {
                    print_flag = 0 ;
                    yylval.ival = PIPE_OUT;
                    string_buff[0] = '|' ;
                    string_buff[1] = 0 ;
                    ct_ret(IO_OUT) ;
                }
                else
                    ct_ret(PIPE) ;
            }

        case SC_AND :
            if ( next() == '&' )
            {
                eat_nl() ;
                ct_ret(AND) ;
            }
            else
            {
                un_next() ;
                yylval.ival = '&' ;
                ct_ret(UNEXPECTED) ;
            }

        case SC_QMARK :
            ct_ret(QMARK) ;

        case SC_COLON :
            ct_ret(COLON) ;

        case SC_RBRACE :
            if ( --brace_cnt < 0 )
            {
                compile_error("extra '}'" ) ;
                eat_semi_colon() ;
                brace_cnt = 0 ;
                goto reswitch ;
            }

            if ( (c = current_token) == NL || c == SEMI_COLON
                 || c == SC_FAKE_SEMI_COLON || c == RBRACE )
            {
                /* if the brace_cnt is zero , we've completed
                   a pattern action block. If the user insists
                   on adding a semi-colon on the same line
                   we will eat it.  Note what we do below:
                   physical law -- conservation of semi-colons */

                if ( brace_cnt == 0 )
                    eat_semi_colon() ;
                eat_nl() ;
                ct_ret(RBRACE) ;
            }

            /* supply missing semi-colon to statement that
               precedes a '}' : undo the decrement, push the '}' back
               so it is scanned again, and remember that the
               semi-colon was a fake one */
            brace_cnt++ ;
            un_next() ;
            current_token = SC_FAKE_SEMI_COLON ;
            return SEMI_COLON ;

        case SC_DIGIT :
        case SC_DOT :
            {
                double d ;
                int flag ;
                /* 0.0 and 1.0 share static storage; every other
                   constant gets its own ZMALLOC'd double */
                static double double_zero = 0.0 ;
                static double double_one = 1.0 ;

                if ( (d = collect_decimal(c, &flag)) == 0.0 )
                {
                    if ( flag )
                        ct_ret(flag) ;
                    else
                        yylval.ptr = (PTR) &double_zero ;
                }
                else if ( d == 1.0 )
                    yylval.ptr = (PTR) &double_one ;
                else
                {
                    yylval.ptr = (PTR) ZMALLOC(double) ;
                    *(double*)yylval.ptr = d ;
                }
                ct_ret( DOUBLE ) ;
            }

        case SC_DOLLAR : /* '$' */
            {
                double d ;
                int flag ;

                while ( scan_code[c = next()] == SC_SPACE )
                    ;
                if ( scan_code[c] != SC_DIGIT &&
                     scan_code[c] != SC_DOT )
                {
                    un_next() ;
                    ct_ret(DOLLAR) ;
                }

                /* compute field address at compile time */
                if ( (d = collect_decimal(c, &flag)) == 0.0 )
                {
                    if ( flag )
                        ct_ret(flag) ; /* an error */
                    else
                        yylval.cp = &field[0] ;
                }
                else
                {
                    if ( d > MAX_FIELD )
                    {
                        compile_error(
                            "$%g exceeds maximum field(%d)" ,
                            d, MAX_FIELD) ;
                        d = MAX_FIELD ;
                    }
                    yylval.cp = field_ptr((int)d) ;
                }

                ct_ret(FIELD) ;
            }

        case SC_DQUOTE :
            return current_token = collect_string() ;

        case SC_IDCHAR : /* collect an identifier */
            {
                unsigned char *p =
                    (unsigned char *)string_buff + 1 ;
                SYMTAB *stp ;

                string_buff[0] = c ;

                while ( (c = scan_code[ *p++ = next()]) == SC_IDCHAR ||
                        c == SC_DIGIT )
                    ;

                un_next() ;
                * --p = 0 ;

                switch( (stp = find(string_buff))->type )
                {
                    case ST_NONE :
                        /* check for function call before defined */
                        if ( next() == '(' )
                        {
                            stp->type = ST_FUNCT ;
                            stp->stval.fbp = (FBLOCK *)
                                zmalloc(sizeof(FBLOCK)) ;
                            stp->stval.fbp->name = stp->name ;
                            stp->stval.fbp->code = (INST *) 0 ;
                            yylval.fbp = stp->stval.fbp ;
                            current_token = FUNCT_ID ;
                        }
                        else
                        {
                            yylval.stp = stp ;
                            current_token =
                                current_token == DOLLAR ? D_ID : ID ;
                        }
                        un_next() ;
                        break ;

                    case ST_NR :
                        NR_flag = 1 ;
                        stp->type = ST_VAR ;
                        /* fall thru */

                    case ST_VAR :
                    case ST_ARRAY :
                    case ST_LOCAL_NONE :
                    case ST_LOCAL_VAR :
                    case ST_LOCAL_ARRAY :
                        yylval.stp = stp ;
                        current_token =
                            current_token == DOLLAR ? D_ID : ID ;
                        break ;

                    case ST_ENV :
                        /* first reference: build the array and load
                           the environment into it */
                        stp->type = ST_ARRAY ;
                        stp->stval.array = new_ARRAY() ;
                        load_environ(stp->stval.array) ;
                        yylval.stp = stp ;
                        current_token =
                            current_token == DOLLAR ? D_ID : ID ;
                        break ;

                    case ST_FUNCT :
                        yylval.fbp = stp->stval.fbp ;
                        current_token = FUNCT_ID ;
                        break ;

                    case ST_KEYWORD :
                        current_token = stp->stval.kw ;
                        break ;

                    case ST_BUILTIN :
                        yylval.bip = stp->stval.bip ;
                        current_token = BUILTIN ;
                        break ;

                    case ST_LENGTH :
                        yylval.bip = stp->stval.bip ;

                        /* check for length alone, this is an ugly
                           hack */
                        while ( scan_code[ c = next() ] == SC_SPACE )
                            ;
                        un_next() ;

                        current_token = c == '(' ? BUILTIN : LENGTH ;
                        break ;

                    case ST_FIELD :
                        yylval.cp = stp->stval.cp ;
                        current_token = FIELD ;
                        break ;

                    default :
                        bozo("find returned bad st type") ;
                }
                return current_token ;
            }

        case SC_UNEXPECTED :
            yylval.ival = c & 0xff ;
            ct_ret(UNEXPECTED) ;
    }
    return 0 ; /* never get here make lint happy */
}

/*
 * collect a decimal constant in string_buff.
 * Return the value and error conditions by reference
 *
 * c:    the first character of the constant (already read by the
 *       caller); either a digit or '.'.
 * flag: set to 0 on success, otherwise to the token to be returned
 *       (UNEXPECTED or BAD_DECIMAL); in the error case the function
 *       returns 0.0.
 */
static double collect_decimal(c, flag)
    int c ;
    int *flag ;
{
    register unsigned char *p = (unsigned char*) string_buff + 1;
    unsigned char *endp ;
    double d ;

    *flag = 0 ;
    string_buff[0] = c ;

    if ( c == '.' )
    {
        /* a lone '.' not followed by a digit is not a number */
        if ( scan_code[*p++ = next()] != SC_DIGIT )
        {
            *flag = UNEXPECTED ;
            yylval.ival = '.' ;
            return 0.0 ;
        }
    }
    else
    {
        while ( scan_code[*p++ = next()] == SC_DIGIT )
            ;
        if ( p[-1] != '.' )
        {
            un_next() ;
            p-- ;
        }
    }
    /* get rest of digits after decimal point */
    while ( scan_code[*p++ = next()] == SC_DIGIT )
        ;

    /* check for exponent */
    if ( p[-1] != 'e' && p[-1] != 'E' )
    {
        un_next() ;
        * --p = 0 ;
    }
    else /* get the exponent */
    if ( scan_code[*p = next()] != SC_DIGIT &&
         *p != '-' && *p != '+' )
    {
        *++p = 0 ;
        *flag = BAD_DECIMAL ;
        return 0.0 ;
    }
    else /* get the rest of the exponent */
    {
        p++ ;
        while ( scan_code[*p++ = next()] == SC_DIGIT )
            ;
        un_next() ;
        * --p = 0 ;
    }

    errno = 0 ; /* check for overflow/underflow */
    d = strtod( string_buff, (char **)&endp ) ;

#ifndef STRTOD_UNDERFLOW_ON_ZERO_BUG
    if ( errno )
        compile_error( "%s : decimal %sflow" , string_buff,
                       d == 0.0 ? "under" : "over") ;
#else /* sun4 bug */
    if ( errno && d != 0.0 )
        compile_error( "%s : decimal overflow", string_buff) ;
#endif

    /* strtod() stopped before the end of the collected text */
    if ( endp < p )
    {
        *flag = BAD_DECIMAL ;
        return 0.0 ;
    }
    return d ;
}

/*---------- process escape characters ---------------*/

/* value of a hex letter, indexed by (character - 'A');
   0 marks characters between 'F' and 'a' that are not hex digits */
static char hex_val['f' - 'A' + 1] = {
    10,11,12,13,14,15,                  /* 'A' .. 'F' */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0,
    10,11,12,13,14,15 } ;               /* 'a' .. 'f' */

#define isoctal(x) ((x)>='0'&&(x)<='7')

#define hex_value(x) hex_val[(x)-'A']

#define ishex(x) (scan_code[x] == SC_DIGIT ||\
                  'A' <= (x) && (x) <= 'f' && hex_value(x))

static int PROTO(octal, (char **)) ;
static int PROTO(hex, (char **)) ;

/* process one , two or three octal digits
   moving a pointer forward by reference

   The caller has already checked that the first character is an
   octal digit.  The result is masked to 8 bits. */
static int octal( start_p )
    char **start_p ;
{
    register char *p = *start_p ;
    register unsigned x ;

    x = *p++ - '0' ;
    if ( isoctal(*p) )
    {
        x = (x<<3) + *p++ - '0' ;
        if ( isoctal(*p) )
            x = (x<<3) + *p++ - '0' ;
    }
    *start_p = p ;
    return x & 0xff ;
}

/* process one or two hex digits
   moving a pointer forward by reference

   The caller has already checked (with ishex()) that the first
   character is a hex digit. */
static int hex( start_p )
    char **start_p ;
{
    register unsigned char *p = (unsigned char*) *start_p ;
    register unsigned x ;
    unsigned t ;

    if ( scan_code[*p] == SC_DIGIT )
        x = *p++ - '0' ;
    else
        x = hex_value(*p++) ;

    if ( scan_code[*p] == SC_DIGIT )
        x = (x<<4) + *p++ - '0' ;
    else if ( 'A' <= *p && *p <= 'f' && (t = hex_value(*p)) )
    {
        x = (x<<4) + t ;
        p++ ;
    }

    *start_p = (char *) p ;
    return x ;
}

#define ET_END 9

/* single character escapes; the slot at ET_END is overwritten by
   rm_escape() to act as a search sentinel */
static struct { char in , out ; } escape_test[ET_END+1] = {
    { 'n' , '\n' },
    { 't' , '\t' },
    { 'f' , '\f' },
    { 'b' , '\b' },
    { 'r' , '\r' },
    { 'a' , '\07' },
    { 'v' , '\013' },
    { '\\', '\\' },
    { '\"', '\"' },
    { 0 , 0 } } ;


/* process the escape characters in a string, in place .

   Handles the single character escapes of escape_test[], one to
   three digit octal escapes and \x followed by one or two hex
   digits.  A backslash followed by anything else is kept together
   with the character after it; a trailing backslash is kept as is.
   Returns s. */
char *rm_escape(s)
    char *s ;
{
    register char *p, *q ;
    char *t ;
    int i ;

    q = p = s ;

    while ( *p )
        if ( *p == '\\' )
        {
            escape_test[ET_END].in = * ++p ; /* sentinel */
            i = 0 ;
            while ( escape_test[i].in != *p )
                i++ ;

            if ( i != ET_END ) /* in table */
            {
                p++ ;
                *q++ = escape_test[i].out ;
            }
            else if ( isoctal(*p) )
            {
                t = p ;
                *q++ = octal(&t) ;
                p = t ;
            }
            else if ( *p == 'x' && ishex(*(unsigned char*)(p+1)) )
            {
                t = p+1 ;
                *q++ = hex(&t) ;
                p = t ;
            }
            else if ( *p == 0 ) /* can only happen with command line
                                   assign */
                *q++ = '\\' ;
            else /* not an escape sequence */
            {
                *q++ = '\\' ;
                *q++ = *p++ ;
            }
        }
        else
            *q++ = *p++ ;

    *q = 0 ;
    return s ;
}

/*
 * Collect a string constant (the opening '"' has been read) into
 * string_buff, process its escapes, and return STRING_ with the new
 * STRING in yylval.ptr.  An unterminated string is a fatal error.
 *
 * NOTE(review): no length check is made here while writing into
 * string_buff -- confirm that the buffer size is enforced elsewhere.
 */
static int collect_string()
{
    register unsigned char *p = (unsigned char *)string_buff ;
    int c ;
    int e_flag = 0 ; /* on if have an escape char */

    while ( 1 )
        switch( scan_code[ *p++ = next() ] )
        {
            case SC_DQUOTE : /* done */
                * --p = 0 ;
                goto out ;

            case SC_NL :
                p[-1] = 0 ;
                /* fall thru */

            case 0 : /* unterminated string */
                compile_error(
                    "runaway string constant \"%.10s ..." ,
                    string_buff, token_lineno ) ;
                mawk_exit(1) ;

            case SC_ESCAPE :
                if ( (c = next()) == '\n' )
                {
                    /* backslash-newline: drop both characters */
                    p-- ;
                    lineno++ ;
                }
                else if ( c == 0 )
                    un_next() ;
                else
                {
                    /* keep the backslash and the escaped character;
                       rm_escape() translates them below */
                    *p++ = c ;
                    e_flag = 1 ;
                }
                break ;

            default :
                break ;
        }

out:
    yylval.ptr = (PTR) new_STRING(
        e_flag ? rm_escape( string_buff ) : string_buff ) ;
    return STRING_ ;
}

/*
 * Collect a regular expression (the opening '/' has been read) into
 * string_buff, compile it with re_compile(), and return RE with the
 * result in yylval.ptr.  An unterminated expression is a fatal error.
 *
 * NOTE(review): no length check is made here while writing into
 * string_buff -- confirm that the buffer size is enforced elsewhere.
 */
static int collect_RE()
{
    register unsigned char *p = (unsigned char*) string_buff ;
    int c ;
    STRING *sval ;

    while ( 1 )
        switch( scan_code[ *p++ = next() ] )
        {
            case SC_DIV : /* done */
                * --p = 0 ;
                goto out ;

            case SC_NL :
                p[-1] = 0 ;
                /* fall thru */

            case 0 : /* unterminated re */
                compile_error(
                    "runaway regular expression /%.10s ..." ,
                    string_buff, token_lineno ) ;
                mawk_exit(1) ;

            case SC_ESCAPE :
                switch( c = next() )
                {
                    case '/' : /* \/ becomes a plain '/' */
                        p[-1] = '/' ;
                        break ;

                    case '\n' : /* backslash-newline is dropped */
                        p-- ;
                        break ;

                    case 0 :
                        un_next() ;
                        break ;

                    default : /* keep the backslash and the character */
                        *p++ = c ;
                        break ;
                }
                break ;
        }

out:
    /* now we've got the RE, so compile it */
    sval = new_STRING( string_buff ) ;
    yylval.ptr = re_compile(sval) ;
    free_STRING(sval) ;
    return RE ;
}