optimized token strings - optimized token hashing

This commit is contained in:
bellard 2002-11-23 22:02:40 +00:00
parent 8901fbeef1
commit 9e180474f0

258
tcc.c
View File

@ -80,6 +80,9 @@
#define TOK_HASH_SIZE 2048 /* must be a power of two */ #define TOK_HASH_SIZE 2048 /* must be a power of two */
#define TOK_ALLOC_INCR 512 /* must be a power of two */ #define TOK_ALLOC_INCR 512 /* must be a power of two */
/* growth step, in ints, added to a token string's allocated length each
   time tok_str_realloc() enlarges it */
#define TOK_STR_ALLOC_INCR_BITS 6
#define TOK_STR_ALLOC_INCR (1 << TOK_STR_ALLOC_INCR_BITS)
#define TOK_MAX_SIZE 4 /* token max size in int unit when stored in string */
/* token symbol management */ /* token symbol management */
typedef struct TokenSym { typedef struct TokenSym {
@ -238,6 +241,7 @@ typedef struct ParseState {
typedef struct TokenString { typedef struct TokenString {
int *str; int *str;
int len; int len;
int allocated_len;
int last_line_num; int last_line_num;
} TokenString; } TokenString;
@ -1115,26 +1119,11 @@ void test_lvalue(void)
expect("lvalue"); expect("lvalue");
} }
TokenSym *tok_alloc(const char *str, int len) /* allocate a new token */
static TokenSym *tok_alloc_new(TokenSym **pts, const char *str, int len)
{ {
TokenSym *ts, **pts, **ptable; TokenSym *ts, **ptable;
int i; int i;
unsigned int h;
h = 1;
for(i=0;i<len;i++)
h = h * 263 + ((unsigned char *)str)[i];
h &= (TOK_HASH_SIZE - 1);
pts = &hash_ident[h];
while (1) {
ts = *pts;
if (!ts)
break;
if (ts->len == len && !memcmp(ts->str, str, len))
return ts;
pts = &(ts->hash_next);
}
if (tok_ident >= SYM_FIRST_ANOM) if (tok_ident >= SYM_FIRST_ANOM)
error("memory full"); error("memory full");
@ -1163,6 +1152,33 @@ TokenSym *tok_alloc(const char *str, int len)
return ts; return ts;
} }
/* incremental identifier hash: start from TOK_HASH_INIT and fold in one
   character at a time with TOK_HASH_FUNC */
#define TOK_HASH_INIT 1
#define TOK_HASH_FUNC(h, c) ((h) * 263 + (c))

/* Look up the 'len' bytes at 'str' in the hash_ident table and return the
   matching TokenSym. When no entry of the bucket chain matches, the
   chain's terminating link slot is handed to tok_alloc_new() and its
   result is returned instead. */
static TokenSym *tok_alloc(const char *str, int len)
{
    const unsigned char *bytes = (const unsigned char *)str;
    unsigned int hash = TOK_HASH_INIT;
    TokenSym **link, *sym;
    int k;

    /* hash every byte as an unsigned value */
    for (k = 0; k < len; k++)
        hash = TOK_HASH_FUNC(hash, bytes[k]);

    /* walk the bucket chain until a match or the empty tail slot */
    link = &hash_ident[hash & (TOK_HASH_SIZE - 1)];
    for (sym = *link; sym; sym = *link) {
        if (sym->len == len && memcmp(sym->str, str, len) == 0)
            return sym;
        link = &sym->hash_next;
    }
    return tok_alloc_new(link, str, len);
}
/* CString handling */ /* CString handling */
static void cstr_realloc(CString *cstr, int new_size) static void cstr_realloc(CString *cstr, int new_size)
@ -1904,6 +1920,7 @@ static inline void tok_str_new(TokenString *s)
{ {
s->str = NULL; s->str = NULL;
s->len = 0; s->len = 0;
s->allocated_len = 0;
s->last_line_num = -1; s->last_line_num = -1;
} }
@ -1915,52 +1932,114 @@ static void tok_str_free(int *str)
p = str; p = str;
for(;;) { for(;;) {
t = *p++; t = *p;
/* NOTE: we test zero separately so that GCC can generate a
table for the following switch */
if (t == 0) if (t == 0)
break; break;
if (t == TOK_STR || t == TOK_LSTR || t == TOK_PPNUM) { switch(t) {
case TOK_CINT:
case TOK_CUINT:
case TOK_CCHAR:
case TOK_LCHAR:
case TOK_CFLOAT:
case TOK_LINENUM:
p += 2;
break;
case TOK_PPNUM:
case TOK_STR:
case TOK_LSTR:
/* XXX: use a macro to be portable on 64 bit ? */ /* XXX: use a macro to be portable on 64 bit ? */
cstr = (CString *)(*p++); cstr = (CString *)p[1];
cstr_free(cstr); cstr_free(cstr);
tcc_free(cstr); tcc_free(cstr);
} else { p += 2;
p += tok_ext_size(t); break;
case TOK_CDOUBLE:
case TOK_CLLONG:
case TOK_CULLONG:
p += 3;
break;
case TOK_CLDOUBLE:
p += 1 + (LDOUBLE_SIZE / 4);
break;
default:
p++;
break;
} }
} }
tcc_free(str); tcc_free(str);
} }
/* Enlarge the buffer of token string 's' by TOK_STR_ALLOC_INCR ints.
   error("memory full") is called when tcc_realloc() yields a null
   pointer. The new buffer and capacity are stored in 's' and the new
   buffer is returned. */
static int *tok_str_realloc(TokenString *s)
{
    int new_len = s->allocated_len + TOK_STR_ALLOC_INCR;
    int *new_str = tcc_realloc(s->str, new_len * sizeof(int));

    if (!new_str)
        error("memory full");
    s->str = new_str;
    s->allocated_len = new_len;
    return new_str;
}
static void tok_str_add(TokenString *s, int t) static void tok_str_add(TokenString *s, int t)
{ {
int len, *str; int len, *str;
len = s->len; len = s->len;
str = s->str; str = s->str;
if ((len & 63) == 0) { if (len >= s->allocated_len)
str = tcc_realloc(str, (len + 64) * sizeof(int)); str = tok_str_realloc(s);
if (!str)
return;
s->str = str;
}
str[len++] = t; str[len++] = t;
s->len = len; s->len = len;
} }
static void tok_str_add2(TokenString *s, int t, CValue *cv) static void tok_str_add2(TokenString *s, int t, CValue *cv)
{ {
int n, i; int len, *str;
CValue cv1;
tok_str_add(s, t); len = s->len;
if (t == TOK_STR || t == TOK_LSTR || t == TOK_PPNUM) { str = s->str;
/* special case: need to duplicate string */
cv1.cstr = cstr_dup(cv->cstr); /* allocate space for worst case */
tok_str_add(s, cv1.tab[0]); if (len + TOK_MAX_SIZE > s->allocated_len)
} else { str = tok_str_realloc(s);
n = tok_ext_size(t); str[len++] = t;
for(i=0;i<n;i++) switch(t) {
tok_str_add(s, cv->tab[i]); case TOK_CINT:
case TOK_CUINT:
case TOK_CCHAR:
case TOK_LCHAR:
case TOK_CFLOAT:
case TOK_LINENUM:
str[len++] = cv->tab[0];
break;
case TOK_PPNUM:
case TOK_STR:
case TOK_LSTR:
str[len++] = (int)cstr_dup(cv->cstr);
break;
case TOK_CDOUBLE:
case TOK_CLLONG:
case TOK_CULLONG:
str[len++] = cv->tab[0];
str[len++] = cv->tab[1];
break;
case TOK_CLDOUBLE:
#if LDOUBLE_SIZE == 12
str[len++] = cv->tab[0];
str[len++] = cv->tab[1];
str[len++] = cv->tab[2];
#else
#error add long double size support
#endif
break;
default:
break;
} }
s->len = len;
} }
/* add the current parse token in token string 's' */ /* add the current parse token in token string 's' */
@ -1977,18 +2056,47 @@ static void tok_str_add_tok(TokenString *s)
tok_str_add2(s, tok, &tokc); tok_str_add2(s, tok, &tokc);
} }
/* get a token from an integer array and increment pointer accordingly */ #if LDOUBLE_SIZE == 12
static int tok_get(int **tok_str, CValue *cv) #define LDOUBLE_GET(p, cv) \
{ cv.tab[0] = p[0]; \
int *p, t, n, i; cv.tab[1] = p[1]; \
cv.tab[2] = p[2];
#else
#error add long double size support
#endif
p = *tok_str;
t = *p++; /* get a token from an integer array and increment pointer
n = tok_ext_size(t); accordingly. we code it as a macro to avoid pointer aliasing. */
for(i=0;i<n;i++) #define TOK_GET(t, p, cv) \
cv->tab[i] = *p++; { \
*tok_str = p; t = *p++; \
return t; switch(t) { \
case TOK_CINT: \
case TOK_CUINT: \
case TOK_CCHAR: \
case TOK_LCHAR: \
case TOK_CFLOAT: \
case TOK_LINENUM: \
case TOK_STR: \
case TOK_LSTR: \
case TOK_PPNUM: \
cv.tab[0] = *p++; \
break; \
case TOK_CDOUBLE: \
case TOK_CLLONG: \
case TOK_CULLONG: \
cv.tab[0] = p[0]; \
cv.tab[1] = p[1]; \
p += 2; \
break; \
case TOK_CLDOUBLE: \
LDOUBLE_GET(p, cv); \
p += LDOUBLE_SIZE / 4; \
break; \
default: \
break; \
} \
} }
/* defines handling */ /* defines handling */
@ -2059,7 +2167,7 @@ static Sym *label_push(int v, int flags)
} }
/* eval an expression for #if/#elif */ /* eval an expression for #if/#elif */
int expr_preprocess(void) static int expr_preprocess(void)
{ {
int c, t; int c, t;
TokenString str; TokenString str;
@ -2096,13 +2204,13 @@ int expr_preprocess(void)
} }
#if defined(PARSE_DEBUG) || defined(PP_DEBUG) #if defined(PARSE_DEBUG) || defined(PP_DEBUG)
void tok_print(int *str) static void tok_print(int *str)
{ {
int t; int t;
CValue cval; CValue cval;
while (1) { while (1) {
t = tok_get(&str, &cval); TOK_GET(t, str, cval);
if (!t) if (!t)
break; break;
printf(" %s", get_tok_str(t, &cval)); printf(" %s", get_tok_str(t, &cval));
@ -2876,6 +2984,7 @@ static inline void next_nomacro1(void)
int b, t, c; int b, t, c;
TokenSym *ts; TokenSym *ts;
uint8_t *p, *p1; uint8_t *p, *p1;
unsigned int h;
p = file->buf_ptr; p = file->buf_ptr;
redo_no_start: redo_no_start:
@ -2944,10 +3053,10 @@ static inline void next_nomacro1(void)
break; break;
case '\n': case '\n':
file->line_num++;
if (return_linefeed) { if (return_linefeed) {
tok = TOK_LINEFEED; tok = TOK_LINEFEED;
} else { } else {
file->line_num++;
tok_flags |= TOK_FLAG_BOL; tok_flags |= TOK_FLAG_BOL;
p++; p++;
goto redo_no_start; goto redo_no_start;
@ -2989,16 +3098,35 @@ static inline void next_nomacro1(void)
case '_': case '_':
parse_ident_fast: parse_ident_fast:
p1 = p; p1 = p;
h = TOK_HASH_INIT;
h = TOK_HASH_FUNC(h, c);
p++; p++;
for(;;) { for(;;) {
c = *p; c = *p;
if (!isid(c) && !isnum(c)) if (!isid(c) && !isnum(c))
break; break;
h = TOK_HASH_FUNC(h, c);
p++; p++;
} }
if (c != '\\') { if (c != '\\') {
/* fast case : no stray found, so we have the full token */ TokenSym **pts;
ts = tok_alloc(p1, p - p1); int len;
/* fast case : no stray found, so we have the full token
and we have already hashed it */
len = p - p1;
h &= (TOK_HASH_SIZE - 1);
pts = &hash_ident[h];
for(;;) {
ts = *pts;
if (!ts)
break;
if (ts->len == len && !memcmp(ts->str, p1, len))
goto token_found;
pts = &(ts->hash_next);
}
ts = tok_alloc_new(pts, p1, len);
token_found: ;
} else { } else {
/* slower case */ /* slower case */
cstr_reset(&tokcstr); cstr_reset(&tokcstr);
@ -3019,8 +3147,8 @@ static inline void next_nomacro1(void)
tok = ts->tok; tok = ts->tok;
break; break;
case 'L': case 'L':
c = p[1]; t = p[1];
if (c != '\\' && c != '\'' && c != '\"') { if (t != '\\' && t != '\'' && t != '\"') {
/* fast case */ /* fast case */
goto parse_ident_fast; goto parse_ident_fast;
} else { } else {
@ -3268,7 +3396,7 @@ static void next_nomacro(void)
redo: redo:
tok = *macro_ptr; tok = *macro_ptr;
if (tok) { if (tok) {
tok = tok_get(&macro_ptr, &tokc); TOK_GET(tok, macro_ptr, tokc);
if (tok == TOK_LINENUM) { if (tok == TOK_LINENUM) {
file->line_num = tokc.i; file->line_num = tokc.i;
goto redo; goto redo;
@ -3291,12 +3419,12 @@ static int *macro_arg_subst(Sym **nested_list, int *macro_str, Sym *args)
tok_str_new(&str); tok_str_new(&str);
last_tok = 0; last_tok = 0;
while(1) { while(1) {
t = tok_get(&macro_str, &cval); TOK_GET(t, macro_str, cval);
if (!t) if (!t)
break; break;
if (t == '#') { if (t == '#') {
/* stringize */ /* stringize */
t = tok_get(&macro_str, &cval); TOK_GET(t, macro_str, cval);
if (!t) if (!t)
break; break;
s = sym_find2(args, t); s = sym_find2(args, t);
@ -3307,7 +3435,7 @@ static int *macro_arg_subst(Sym **nested_list, int *macro_str, Sym *args)
while (*st) { while (*st) {
if (notfirst) if (notfirst)
cstr_ccat(&cstr, ' '); cstr_ccat(&cstr, ' ');
t = tok_get(&st, &cval); TOK_GET(t, st, cval);
cstr_cat(&cstr, get_tok_str(t, &cval)); cstr_cat(&cstr, get_tok_str(t, &cval));
notfirst = 1; notfirst = 1;
} }
@ -3348,7 +3476,7 @@ static int *macro_arg_subst(Sym **nested_list, int *macro_str, Sym *args)
int t1; int t1;
add_var: add_var:
for(;;) { for(;;) {
t1 = tok_get(&st, &cval); TOK_GET(t1, st, cval);
if (!t1) if (!t1)
break; break;
tok_str_add2(&str, t1, &cval); tok_str_add2(&str, t1, &cval);
@ -3392,7 +3520,7 @@ static int *macro_twosharps(void)
macro_ptr1 = macro_ptr; macro_ptr1 = macro_ptr;
t = *macro_ptr; t = *macro_ptr;
if (t) { if (t) {
t = tok_get(&macro_ptr, &cval); TOK_GET(t, macro_ptr, cval);
/* We concatenate the two tokens if we have an /* We concatenate the two tokens if we have an
identifier or a preprocessing number */ identifier or a preprocessing number */