NetBSD/usr.bin/indent/lexi.c
agc b69f871588 Bump number of elements in specials array from 100 to 1000.
Typedefs are added to this array, and it silently ignores
any attempts to enter more elements when the array is full.
1997-09-09 09:28:19 +00:00

563 lines
15 KiB
C

/* $NetBSD: lexi.c,v 1.4 1997/09/09 09:28:19 agc Exp $ */
/*
* Copyright (c) 1985 Sun Microsystems, Inc.
* Copyright (c) 1980 The Regents of the University of California.
* Copyright (c) 1976 Board of Trustees of the University of Illinois.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
 * Revision identification for this file, kept in a file-local character
 * array.  The whole definition is compiled out when `lint' is defined.
 */
#ifndef lint
/*static char sccsid[] = "from: @(#)lexi.c 5.16 (Berkeley) 2/26/91";*/
static char rcsid[] = "$NetBSD: lexi.c,v 1.4 1997/09/09 09:28:19 agc Exp $";
#endif /* not lint */
/*
* Here we have the token scanner for indent. It scans off one token and puts
* it in the global variable "token". It returns a code, indicating the type
* of token scanned.
*/
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include "indent_globs.h"
#include "indent_codes.h"
/*
 * Character classes stored in chartype[].  A class of 0 in that table means
 * "neither of these".
 */
/* letters, digits, '_' and '$': characters that make up words and numbers */
#define alphanum 1
/* characters that make up operators; only referenced from disabled code */
#define opchar 3
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() dispatches on once the word has been recognised.
 */
struct templ {
	char *rwd;		/* keyword text; a null pointer ends the table */
	int rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved words known to the scanner.  The array is much larger than the
 * initial list because addkey() appends further words to it at run time; the
 * list is always terminated by an entry whose rwd is a null pointer.
 *
 * Meaning of rwcode, as handled by the switch in lexi():
 *   0  keyword with no special token code (reported as ident)
 *   1  switch
 *   2  case, default
 *   3  struct, union, enum
 *   4  declaration keywords (types and storage classes)
 *   5  if, while, for
 *   6  else, do
 *   7  sizeof
 */
struct templ specials[1000] =
{
	{"switch", 1},
	{"case", 2},
	{"break", 0},
	{"struct", 3},
	{"union", 3},
	{"enum", 3},
	{"default", 2},
	{"int", 4},
	{"char", 4},
	{"float", 4},
	{"double", 4},
	{"long", 4},
	{"short", 4},
	{"typedef", 4},
	{"unsigned", 4},
	{"register", 4},
	{"static", 4},
	{"global", 4},
	{"extern", 4},
	{"void", 4},
	{"goto", 0},
	{"return", 0},
	{"if", 5},
	{"while", 5},
	{"for", 5},
	{"else", 6},
	{"do", 6},
	{"sizeof", 7},
	{0, 0}
};
/*
 * Character class lookup, indexed by 7-bit character code.  A value of 1
 * (alphanum) marks the characters that make up words and numbers: letters,
 * digits, '_' and '$'.  A value of 3 (opchar) marks operator characters.
 * Every other character is classed 0.
 */
char chartype[128] =
{
	/* 0x00-0x0f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10-0x1f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20-0x2f: ! % & * + - / are operators, $ is alphanumeric */
	0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
	/* 0x30-0x3f: the digits, then < = > ? as operators */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
	/* 0x40-0x4f: @ is unclassified, then A-O */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50-0x5f: P-Z, then ^ as operator and _ as alphanumeric */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
	/* 0x60-0x6f: the grave accent is unclassified, then a-o */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70-0x7f: p-z, then | and ~ as operators */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
/*
 * lexi -- scan the next token of the input and classify it.
 *
 * Leading blanks and tabs are skipped, the token text is copied into the
 * token buffer starting at s_token and NUL-terminated, and an integer token
 * code is returned.  The codes returned by this function are: ident, decl,
 * swstmt, casestmt, sp_paren, sp_nparen, newline (or 0 once had_eof is set),
 * lparen, rparen, preesc, question, colon, semicolon, lbrace, rbrace,
 * form_feed, comma, period, unary_op, binary_op, postop and comment
 * (presumably all defined in indent_codes.h).
 *
 * Besides the token buffer, the function updates fields of the parser state
 * `ps': col_1, last_nl and last_u_d on every call, and, depending on the
 * token, its_a_keyword, sizeof_keyword, cast_mask, block_init, want_blank,
 * procname and in_parameter_declaration.  The statics last_code and l_struct
 * carry context from one call to the next.
 */
int
lexi()
{
int unary_delim; /* set to 1 if the current token forces a following
 * operator to be unary */
static int last_code; /* the last token type returned */
static int l_struct; /* set when the previous token was a class 3 keyword
 * (struct, union, enum) seen outside parentheses */
int code; /* internal code to be returned */
char qchar; /* the delimiter character for a string */
e_token = s_token; /* point to start of place to save token */
unary_delim = false;
ps.col_1 = ps.last_nl; /* tell world that this token started in
* column 1 iff the last thing scanned was nl */
ps.last_nl = false;
while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
ps.col_1 = false; /* leading blanks imply token is not in column
* 1 */
if (++buf_ptr >= buf_end)
fill_buffer();
}
/*
 * Scan an alphanumeric token.  Since && binds tighter than ||, the test
 * reads: the character is of class alphanum, or it is a '.' immediately
 * followed by a digit.
 *
 * NOTE(review): chartype[] has 128 entries and is indexed directly with
 * *buf_ptr here and further down.  If buf_ptr points at plain (possibly
 * signed) char, a byte outside 0..127 would index outside the table --
 * confirm against the declaration of buf_ptr.
 */
if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
/*
* we have a character or number
*/
register char *j; /* used for searching thru list of reserved words */
register struct templ *p; /* walks the specials[] table */
if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
/* a numeric constant */
int seendot = 0,
seenexp = 0;
if (*buf_ptr == '0' &&
(buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
/* hexadecimal: copy the two-character prefix, then all hex digits */
*e_token++ = *buf_ptr++;
*e_token++ = *buf_ptr++;
while (isxdigit(*buf_ptr)) {
CHECK_SIZE_TOKEN;
*e_token++ = *buf_ptr++;
}
}
else
/* decimal or floating constant */
while (1) {
if (*buf_ptr == '.')
if (seendot)
break; /* a second '.', or one after the exponent, ends
 * the number */
else
seendot++;
CHECK_SIZE_TOKEN;
*e_token++ = *buf_ptr++;
if (!isdigit(*buf_ptr) && *buf_ptr != '.')
if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
break; /* not a digit, '.', or first exponent marker */
else {
seenexp++;
seendot++; /* no '.' is accepted after the exponent marker */
CHECK_SIZE_TOKEN;
*e_token++ = *buf_ptr++;
if (*buf_ptr == '+' || *buf_ptr == '-')
*e_token++ = *buf_ptr++; /* optional exponent sign */
}
}
/* at most one L or l suffix is absorbed into the token */
if (*buf_ptr == 'L' || *buf_ptr == 'l')
*e_token++ = *buf_ptr++;
}
else
/* an identifier or keyword */
while (chartype[*buf_ptr] == alphanum) { /* copy it over */
CHECK_SIZE_TOKEN;
*e_token++ = *buf_ptr++;
if (buf_ptr >= buf_end)
fill_buffer();
}
*e_token++ = '\0';
/*
 * Skip blanks after the word so that *buf_ptr is the next significant
 * character for the look-ahead tests further down.
 */
while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
if (++buf_ptr >= buf_end)
fill_buffer();
}
ps.its_a_keyword = false;
ps.sizeof_keyword = false;
if (l_struct) { /* if last token was 'struct', then this token
* should be treated as a declaration */
l_struct = false;
last_code = ident;
ps.last_u_d = true;
return (decl);
}
ps.last_u_d = false; /* Operator after identifier is binary */
last_code = ident; /* Remember that this is the code we will
* return */
/*
 * This loop will check if the token is a keyword.  Inside the loop body
 * the name `p' is redeclared as a char pointer into the token, hiding the
 * table pointer `p' of the for statement.  Both the break and the goto
 * leave the loop with the table pointer still on the matching entry.
 */
for (p = specials; (j = p->rwd) != 0; p++) {
register char *p = s_token; /* point at scanned token */
if (*j++ != *p++ || *j++ != *p++)
continue; /* This test depends on the fact that
* identifiers are always at least 1 character
* long (ie. the first two bytes of the
* identifier are always meaningful) */
if (p[-1] == 0)
break; /* first two bytes matched and the second is NUL:
 * a one-character word matched a one-character
 * table entry */
while (*p++ == *j)
if (*j++ == 0)
goto found_keyword; /* I wish that C had a multi-level
* break... */
}
if (p->rwd) { /* we have a keyword (loop left via break) */
found_keyword:
ps.its_a_keyword = true;
ps.last_u_d = true;
switch (p->rwcode) {
case 1: /* it is a switch */
return (swstmt);
case 2: /* a case or default */
return (casestmt);
case 3: /* a "struct" */
if (ps.p_l_follow)
break; /* inside parens: cast; processing continues
 * after the switch as for a plain identifier */
l_struct = true;
/*
* Next time around, we will want to know that we have had a
* 'struct'
*/
/* FALLTHROUGH */
case 4: /* one of the declaration keywords */
if (ps.p_l_follow) {
/* remember the paren level at which a type keyword was seen */
ps.cast_mask |= 1 << ps.p_l_follow;
break; /* inside parens: cast */
}
last_code = decl;
return (decl);
case 5: /* if, while, for */
return (sp_paren);
case 6: /* do, else */
return (sp_nparen);
case 7: /* sizeof */
ps.sizeof_keyword = true;
/* FALLTHROUGH */
default: /* all others are treated like any other
* identifier */
return (ident);
} /* end of switch */
} /* end of if (found_it) */
/*
 * A word followed by '(' at the outermost level (ps.tos <= 1 and
 * ps.ind_level == 0) is taken to start a procedure definition unless a
 * ')' directly followed by ';' or ',' is found in the rest of the buffer.
 *
 * NOTE(review): strncpy() copies at most sizeof ps.procname - 1 bytes and
 * does not store a terminator when it truncates; termination then depends
 * on the last byte of ps.procname already being zero -- confirm.  Also,
 * *tp is read after tp++ and so may be read at buf_end -- confirm that
 * position is readable.
 */
if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
register char *tp = buf_ptr;
while (tp < buf_end)
if (*tp++ == ')' && (*tp == ';' || *tp == ','))
goto not_proc;
strncpy(ps.procname, token, sizeof ps.procname - 1);
ps.in_parameter_declaration = 1;
rparen_count = 1;
not_proc:;
}
/*
* The following hack attempts to guess whether or not the current
* token is in fact a declaration keyword -- one that has been
* typedefd
*/
if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
&& !ps.p_l_follow
&& !ps.block_init
&& (ps.last_token == rparen || ps.last_token == semicolon ||
ps.last_token == decl ||
ps.last_token == lbrace || ps.last_token == rbrace)) {
ps.its_a_keyword = true;
ps.last_u_d = true;
last_code = decl;
return decl;
}
if (last_code == decl) /* if this is a declared variable, then
* following sign is unary */
ps.last_u_d = true; /* will make "int a -1" work */
last_code = ident;
return (ident); /* the ident is not in the list */
} /* end of processing for alphanum character */
/* Scan a non-alphanumeric token */
*e_token++ = *buf_ptr; /* if it is only a one-character token, it is
* moved here */
*e_token = '\0';
if (++buf_ptr >= buf_end)
fill_buffer();
switch (*token) {
case '\n':
unary_delim = ps.last_u_d;
ps.last_nl = true; /* remember that we just had a newline */
code = (had_eof ? 0 : newline);
/*
* if data has been exhausted, the newline is a dummy, and we should
* return code to stop
*/
break;
case '\'': /* start of quoted character */
case '"': /* start of string */
qchar = *token;
if (troff) {
e_token[-1] = '`';
if (qchar == '"')
*e_token++ = '`';
e_token = chfont(&bodyf, &stringf, e_token);
}
do { /* copy the string */
while (1) { /* move one character, or a backslash together
 * with the character it escapes */
if (*buf_ptr == '\n') {
printf("%d: Unterminated literal\n", line_no);
goto stop_lit;
}
CHECK_SIZE_TOKEN; /* Only have to do this once in this loop,
* since CHECK_SIZE guarantees that there
* are at least 5 entries left */
*e_token = *buf_ptr++;
if (buf_ptr >= buf_end)
fill_buffer();
if (*e_token == BACKSLASH) { /* if escape, copy extra char */
if (*buf_ptr == '\n') /* check for escaped newline */
++line_no;
if (troff) {
*++e_token = BACKSLASH;
if (*buf_ptr == BACKSLASH)
*++e_token = BACKSLASH;
}
*++e_token = *buf_ptr++;
++e_token; /* we must increment this again because we
* copied two chars */
if (buf_ptr >= buf_end)
fill_buffer();
}
else
break; /* we copied one character */
} /* end of while (1) */
} while (*e_token++ != qchar); /* stop once the closing delimiter has
 * been copied */
if (troff) {
e_token = chfont(&stringf, &bodyf, e_token - 1);
if (qchar == '"')
*e_token++ = '\'';
}
stop_lit:
code = ident; /* literals are reported as ident */
break;
case ('('):
case ('['):
unary_delim = true;
code = lparen;
break;
case (')'):
case (']'):
code = rparen;
break;
case '#':
unary_delim = ps.last_u_d;
code = preesc;
break;
case '?':
unary_delim = true;
code = question;
break;
case (':'):
code = colon;
unary_delim = true;
break;
case (';'):
unary_delim = true;
code = semicolon;
break;
case ('{'):
unary_delim = true;
/*
* if (ps.in_or_st) ps.block_init = 1;
*/
/* ? code = ps.block_init ? lparen : lbrace; */
code = lbrace;
break;
case ('}'):
unary_delim = true;
/* ? code = ps.block_init ? rparen : rbrace; */
code = rbrace;
break;
case 014: /* a form feed */
unary_delim = ps.last_u_d;
ps.last_nl = true; /* remember this so we can set 'ps.col_1'
* right */
code = form_feed;
break;
case (','):
unary_delim = true;
code = comma;
break;
case '.':
unary_delim = false;
code = period;
break;
case '-':
case '+': /* check for -, +, --, ++ */
code = (ps.last_u_d ? unary_op : binary_op);
unary_delim = true;
if (*buf_ptr == token[0]) {
/* check for doubled character */
*e_token++ = *buf_ptr++;
/* buffer overflow will be checked at end of loop */
if (last_code == ident || last_code == rparen) {
code = (ps.last_u_d ? unary_op : postop);
/* check for following ++ or -- */
unary_delim = false;
}
}
else if (*buf_ptr == '=')
/* check for operator += */
*e_token++ = *buf_ptr++;
/*
 * NOTE(review): the test below is shared by both case labels, so a '>'
 * following '+' is absorbed into the token as well, not only the one in
 * "->".
 */
else if (*buf_ptr == '>') {
/* check for operator -> */
*e_token++ = *buf_ptr++;
if (!pointer_as_binop) {
unary_delim = false;
code = unary_op;
ps.want_blank = false;
}
}
break; /* buffer overflow will be checked at end of
* switch */
case '=':
if (ps.in_or_st)
ps.block_init = 1;
#ifdef undef
if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */
e_token[-1] = *buf_ptr++;
if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
*e_token++ = *buf_ptr++;
*e_token++ = '='; /* Flip =+ to += */
*e_token = 0;
}
#else
if (*buf_ptr == '=') {/* == */
*e_token++ = '='; /* append the second '=' of the == operator */
buf_ptr++;
*e_token = 0;
}
#endif
code = binary_op;
unary_delim = true;
break;
/* the break above means '=' never falls into the cases below */
case '>':
case '<':
case '!': /* ops like <, <<, <=, !=, etc */
if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
*e_token++ = *buf_ptr;
if (++buf_ptr >= buf_end)
fill_buffer();
}
if (*buf_ptr == '=') /* third character, as in <<= and >>= */
*e_token++ = *buf_ptr++;
code = (ps.last_u_d ? unary_op : binary_op);
unary_delim = true;
break;
default:
if (token[0] == '/' && *buf_ptr == '*') {
/* it is start of comment */
*e_token++ = '*';
if (++buf_ptr >= buf_end)
fill_buffer();
code = comment;
unary_delim = ps.last_u_d;
break;
}
while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
/*
* handle ||, &&, etc, and also things as in int *****i
*/
*e_token++ = *buf_ptr;
if (++buf_ptr >= buf_end)
fill_buffer();
}
code = (ps.last_u_d ? unary_op : binary_op);
unary_delim = true;
} /* end of switch */
if (code != newline) {
/* a newline leaves the struct flag and last_code untouched */
l_struct = false;
last_code = code;
}
if (buf_ptr >= buf_end) /* check for input buffer empty */
fill_buffer();
ps.last_u_d = unary_delim;
*e_token = '\0'; /* null terminate the token */
return (code);
}
/*
 * Add the given keyword to the keyword table, using val as the keyword type.
 *
 * key is stored by reference, not copied, so the caller must keep the string
 * alive for as long as the table is in use.  If key is already present the
 * table is left unchanged (the existing type is kept).
 *
 * Returns 0 if the keyword is in the table on return, or -1 if the table is
 * full and the keyword was dropped.  Callers that ignore the result get the
 * historical behaviour of silently discarding keywords that do not fit.
 */
int
addkey(key, val)
	char *key;
	int val;
{
	register struct templ *p = specials;
	struct templ *last = specials + sizeof specials / sizeof specials[0] - 1;

	/* look for an existing entry; stop on the null terminator */
	while (p->rwd != 0) {
		if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
			return 0;
		p++;
	}

	/*
	 * p now addresses the terminator.  Appending needs that slot for the
	 * new word plus the following slot for the new terminator, so the
	 * table is full as soon as the terminator occupies the last element.
	 */
	if (p >= last)
		return -1;

	p->rwd = key;
	p->rwcode = val;
	p[1].rwd = 0;
	p[1].rwcode = 0;
	return 0;
}