chibicc/preprocess.c

408 lines
10 KiB
C
Raw Normal View History

// This file implements the C preprocessor.
//
// The preprocessor takes a list of tokens as an input and returns a
// new list of tokens as an output.
//
// The preprocessing language is designed in such a way that macro
// expansion is guaranteed to stop even if there is a recursive macro.
// Informally speaking, a macro is applied only once for each token.
// That is, if a macro token T appears in a result of direct or
// indirect macro expansion of T, T won't be expanded any further.
// For example, if T is defined as U, and U is defined as T, then
// token T is expanded to U and then to T and the macro expansion
// stops at that point.
//
// To achieve the above behavior, we attach to each token a set of
// macro names from which the token is expanded. The set is called
// "hideset". Hideset is initially empty, and every time we expand a
// macro, the macro name is added to the resulting tokens' hidesets.
//
// The above macro expansion algorithm is explained in this document,
// which is used as a basis for the standard's wording:
// https://github.com/rui314/chibicc/wiki/cpp.algo.pdf
2020-08-18 03:41:59 +03:00
#include "chibicc.h"
2020-03-29 07:29:48 +03:00
typedef struct Macro Macro;
struct Macro {
Macro *next;
char *name;
2020-08-18 04:45:03 +03:00
bool is_objlike; // Object-like or function-like
2020-03-29 07:29:48 +03:00
Token *body;
2020-03-29 12:23:33 +03:00
bool deleted;
2020-03-29 07:29:48 +03:00
};
2020-08-20 13:36:36 +03:00
// `#if` can be nested, so we use a stack to manage nested `#if`s.
typedef struct CondIncl CondIncl;
struct CondIncl {
CondIncl *next;
2020-03-29 02:51:06 +03:00
enum { IN_THEN, IN_ELIF, IN_ELSE } ctx;
2020-08-20 13:36:36 +03:00
Token *tok;
2020-03-30 03:57:07 +03:00
bool included;
2020-08-20 13:36:36 +03:00
};
// A hideset is a singly-linked list of macro names.
typedef struct Hideset {
  struct Hideset *next; // remaining names in the set
  char *name;           // one macro name
} Hideset;
2020-03-29 07:29:48 +03:00
// Head of the list of macro definitions. add_macro() pushes new entries
// at the front and find_macro() searches from the front.
static Macro *macros;
2020-08-20 13:36:36 +03:00
// Top of the stack of currently open conditional-inclusion directives.
static CondIncl *cond_incl;
// Forward declaration: eval_const_expr() calls preprocess2() before
// its definition appears in this file.
static Token *preprocess2(Token *tok);
2020-03-30 03:30:06 +03:00
// Returns true if `tok` is a `#` that is the first token on its line.
static bool is_hash(Token *tok) {
  if (!tok->at_bol)
    return false;
  return equal(tok, "#");
}
// Some preprocessor directives such as #include allow extraneous
// tokens before newline. This function skips such tokens.
//
// If `tok` already starts a new line it is returned unchanged.
// Otherwise a single "extra token" warning is issued for `tok` and the
// first following token that begins a line is returned.
static Token *skip_line(Token *tok) {
  if (tok->at_bol)
    return tok;
  warn_tok(tok, "extra token");
  // Advance past the rest of the current line. The condition must be
  // negated: at this point tok->at_bol is known to be false, so testing
  // it un-negated would never skip anything.
  while (!tok->at_bol)
    tok = tok->next;
  return tok;
}
2020-09-03 13:24:23 +03:00
// Returns a newly allocated copy of `tok` whose `next` link is cleared,
// so the copy is not attached to any list.
static Token *copy_token(Token *tok) {
  Token *dup = calloc(1, sizeof *dup);
  *dup = *tok;
  dup->next = NULL;
  return dup;
}
2020-08-20 13:36:36 +03:00
// Creates a zero-length TK_EOF token. All other fields are copied from
// `tok`.
static Token *new_eof(Token *tok) {
  Token *eof = copy_token(tok);
  eof->len = 0;
  eof->kind = TK_EOF;
  return eof;
}
// Allocates a one-element hideset holding `name`. The string is not
// copied; the node stores the pointer it is given.
static Hideset *new_hideset(char *name) {
  Hideset *node = calloc(1, sizeof *node);
  node->name = name;
  return node;
}
// Returns a list made of fresh copies of the nodes of `hs1` followed by
// `hs2` itself. `hs2` is shared with the result, not copied, and
// duplicate names are not removed.
static Hideset *hideset_union(Hideset *hs1, Hideset *hs2) {
  Hideset *result = NULL;
  Hideset **tail = &result;

  for (Hideset *src = hs1; src; src = src->next) {
    *tail = new_hideset(src->name);
    tail = &(*tail)->next;
  }
  *tail = hs2;
  return result;
}
// Returns true if some name in `hs` is exactly the `len` bytes starting
// at `s`. `s` does not need to be NUL-terminated.
static bool hideset_contains(Hideset *hs, char *s, int len) {
  while (hs) {
    if (strlen(hs->name) == len && !strncmp(hs->name, s, len))
      return true;
    hs = hs->next;
  }
  return false;
}
// Returns a copy of the token list `tok` in which every token's hideset
// is the union of its previous hideset and `hs`. The input list is left
// untouched.
static Token *add_hideset(Token *tok, Hideset *hs) {
  Token *result = NULL;
  Token **tail = &result;

  while (tok) {
    Token *dup = copy_token(tok);
    dup->hideset = hideset_union(dup->hideset, hs);
    *tail = dup;
    tail = &dup->next;
    tok = tok->next;
  }
  return result;
}
2020-09-03 13:24:23 +03:00
// Append tok2 to the end of tok1.
//
// The tokens of tok1 up to, but not including, its TK_EOF are copied,
// and tok2 is linked after the last copy. If tok1 starts with TK_EOF,
// the result is tok2 itself.
static Token *append(Token *tok1, Token *tok2) {
  Token *result = NULL;
  Token **tail = &result;

  while (tok1->kind != TK_EOF) {
    Token *dup = copy_token(tok1);
    *tail = dup;
    tail = &dup->next;
    tok1 = tok1->next;
  }
  *tail = tok2;
  return result;
}
2020-03-30 03:57:07 +03:00
// Skips a nested conditional group. Scanning starts at `tok` and stops
// at the matching `#endif`; the token after the `endif` keyword is
// returned. Groups nested deeper are skipped recursively. If TK_EOF is
// reached first, the TK_EOF token is returned.
static Token *skip_cond_incl2(Token *tok) {
  while (tok->kind != TK_EOF) {
    if (!is_hash(tok)) {
      tok = tok->next;
      continue;
    }

    Token *directive = tok->next;
    if (equal(directive, "if") || equal(directive, "ifdef") ||
        equal(directive, "ifndef")) {
      tok = skip_cond_incl2(directive->next);
      continue;
    }
    if (equal(directive, "endif"))
      return directive->next;
    tok = tok->next;
  }
  return tok;
}
2020-03-29 02:51:06 +03:00
// Skip until next `#else`, `#elif` or `#endif`.
// Nested `#if` and `#endif` are skipped.
2020-08-20 13:36:36 +03:00
static Token *skip_cond_incl(Token *tok) {
while (tok->kind != TK_EOF) {
2020-03-29 11:18:31 +03:00
if (is_hash(tok) &&
(equal(tok->next, "if") || equal(tok->next, "ifdef") ||
equal(tok->next, "ifndef"))) {
2020-03-30 03:57:07 +03:00
tok = skip_cond_incl2(tok->next->next);
continue;
}
2020-03-30 03:57:07 +03:00
if (is_hash(tok) &&
2020-03-29 02:51:06 +03:00
(equal(tok->next, "elif") || equal(tok->next, "else") ||
equal(tok->next, "endif")))
break;
2020-08-20 13:36:36 +03:00
tok = tok->next;
}
return tok;
}
// Copies every token from `tok` up to the next token that begins a
// line, terminates the copy with an EOF token and returns it. `*rest`
// is set to the token that begins the next line. This function is used
// to create a new list of tokens for `#if` arguments.
static Token *copy_line(Token **rest, Token *tok) {
  Token *line = NULL;
  Token **tail = &line;

  while (!tok->at_bol) {
    Token *dup = copy_token(tok);
    *tail = dup;
    tail = &dup->next;
    tok = tok->next;
  }
  *tail = new_eof(tok);
  *rest = tok;
  return line;
}
// Read and evaluate a constant expression.
//
// `tok` is the directive keyword token; the expression is made of the
// tokens after it on the same line. `*rest` is set to the first token
// of the following line. Errors are reported if the line is empty after
// preprocessing or if tokens remain after the expression.
static long eval_const_expr(Token **rest, Token *tok) {
  Token *directive = tok;
  Token *expr = preprocess2(copy_line(rest, directive->next));

  if (expr->kind == TK_EOF)
    error_tok(directive, "no expression");

  Token *leftover;
  long value = const_expr(&leftover, expr);
  if (leftover->kind != TK_EOF)
    error_tok(leftover, "extra token");
  return value;
}
2020-03-30 03:57:07 +03:00
// Pushes a new entry onto the conditional-inclusion stack and returns
// it. The entry starts in the IN_THEN state, records `tok`, and stores
// whether its first clause is included.
static CondIncl *push_cond_incl(Token *tok, bool included) {
  CondIncl *entry = calloc(1, sizeof *entry);
  entry->ctx = IN_THEN;
  entry->tok = tok;
  entry->included = included;
  entry->next = cond_incl;
  cond_incl = entry;
  return entry;
}
2020-03-29 07:29:48 +03:00
// Looks up the macro named by identifier token `tok`. The first entry
// in the list with a matching name decides the result: NULL if that
// entry is marked deleted, otherwise the entry itself. Returns NULL for
// non-identifier tokens and for names with no entry.
static Macro *find_macro(Token *tok) {
  if (tok->kind != TK_IDENT)
    return NULL;

  Macro *m = macros;
  while (m) {
    if (strlen(m->name) == tok->len && !strncmp(m->name, tok->loc, tok->len)) {
      if (m->deleted)
        return NULL;
      return m;
    }
    m = m->next;
  }
  return NULL;
}
2020-08-18 04:45:03 +03:00
// Creates a macro entry and pushes it onto the front of the macro list,
// so that it takes precedence over older entries with the same name.
// Returns the new entry.
static Macro *add_macro(char *name, bool is_objlike, Token *body) {
  Macro *entry = calloc(1, sizeof *entry);
  entry->name = name;
  entry->is_objlike = is_objlike;
  entry->body = body;
  entry->next = macros;
  macros = entry;
  return entry;
}
2020-08-18 04:45:03 +03:00
// Parses the part of a `#define` line that starts at the macro name
// `tok` and registers the macro. `*rest` is set to the first token of
// the next line.
static void read_macro_definition(Token **rest, Token *tok) {
  if (tok->kind != TK_IDENT)
    error_tok(tok, "macro name must be an identifier");
  char *name = strndup(tok->loc, tok->len);
  tok = tok->next;

  // A "(" directly after the name, with no space in between, marks a
  // function-like macro. Only an empty parameter list is accepted.
  bool is_funclike = !tok->has_space && equal(tok, "(");
  if (is_funclike)
    tok = skip(tok->next, ")");

  add_macro(name, !is_funclike, copy_line(rest, tok));
}
2020-03-29 07:29:48 +03:00
// If tok is a macro, expand it and return true.
// Otherwise, do nothing and return false.
//
// On expansion, `*rest` is set to the replacement tokens followed by
// the tokens after the macro invocation.
static bool expand_macro(Token **rest, Token *tok) {
  // A token whose own name is in its hideset is not expanded again.
  if (hideset_contains(tok->hideset, tok->loc, tok->len))
    return false;

  Macro *macro = find_macro(tok);
  if (macro == NULL)
    return false;

  if (!macro->is_objlike) {
    // If a funclike macro token is not followed by an argument list,
    // treat it as a normal identifier.
    if (!equal(tok->next, "("))
      return false;

    // Function-like macro application. Only an empty argument list is
    // accepted.
    // NOTE(review): no hideset is attached to the body on this path.
    Token *after = skip(tok->next->next, ")");
    *rest = append(macro->body, after);
    return true;
  }

  // Object-like macro application: the body tokens get the invoking
  // token's hideset plus the macro's own name.
  Hideset *hs = hideset_union(tok->hideset, new_hideset(macro->name));
  Token *expansion = add_hideset(macro->body, hs);
  *rest = append(expansion, tok->next);
  return true;
}
2020-03-30 03:30:06 +03:00
// Visit all tokens in `tok` while evaluating preprocessing
// macros and directives.
//
// Returns the resulting token list. Tokens that are passed through are
// relinked in place rather than copied, and the list ends with the
// TK_EOF token at which scanning stopped.
static Token *preprocess2(Token *tok) {
  Token *out = NULL;
  Token **tail = &out;

  while (tok->kind != TK_EOF) {
    // If it is a macro, expand it and rescan from the expansion.
    if (expand_macro(&tok, tok))
      continue;

    // Pass through if it is not a "#".
    if (!is_hash(tok)) {
      Token *passed = tok;
      tok = tok->next;
      *tail = passed;
      tail = &passed->next;
      continue;
    }

    Token *start = tok; // the "#" token, used in diagnostics
    tok = tok->next;    // the directive name

    if (equal(tok, "include")) {
      Token *name_tok = tok->next;
      if (name_tok->kind != TK_STR)
        error_tok(name_tok, "expected a filename");

      // A path that does not start with "/" is resolved against the
      // directory of the file containing the directive.
      char *path = name_tok->str;
      if (path[0] != '/')
        path = format("%s/%s", dirname(strdup(name_tok->file->name)),
                      name_tok->str);

      Token *included = tokenize_file(path);
      if (!included)
        error_tok(name_tok, "%s", strerror(errno));

      // Continue with the included file's tokens, then the rest of
      // the current file.
      tok = append(included, skip_line(name_tok->next));
    } else if (equal(tok, "define")) {
      read_macro_definition(&tok, tok->next);
    } else if (equal(tok, "undef")) {
      Token *name_tok = tok->next;
      if (name_tok->kind != TK_IDENT)
        error_tok(name_tok, "macro name must be an identifier");
      char *name = strndup(name_tok->loc, name_tok->len);
      tok = skip_line(name_tok->next);

      // Record the removal as a new entry marked deleted.
      Macro *tombstone = add_macro(name, true, NULL);
      tombstone->deleted = true;
    } else if (equal(tok, "if")) {
      long val = eval_const_expr(&tok, tok);
      push_cond_incl(start, val);
      if (!val)
        tok = skip_cond_incl(tok);
    } else if (equal(tok, "ifdef") || equal(tok, "ifndef")) {
      bool negate = equal(tok, "ifndef");
      bool defined = find_macro(tok->next);
      bool taken = negate ? !defined : defined;

      push_cond_incl(tok, taken);
      tok = skip_line(tok->next->next);
      if (!taken)
        tok = skip_cond_incl(tok);
    } else if (equal(tok, "elif")) {
      if (!cond_incl || cond_incl->ctx == IN_ELSE)
        error_tok(start, "stray #elif");
      cond_incl->ctx = IN_ELIF;

      // The expression is evaluated only while no earlier clause of
      // this group has been included.
      if (cond_incl->included || !eval_const_expr(&tok, tok))
        tok = skip_cond_incl(tok);
      else
        cond_incl->included = true;
    } else if (equal(tok, "else")) {
      if (!cond_incl || cond_incl->ctx == IN_ELSE)
        error_tok(start, "stray #else");
      cond_incl->ctx = IN_ELSE;
      tok = skip_line(tok->next);

      if (cond_incl->included)
        tok = skip_cond_incl(tok);
    } else if (equal(tok, "endif")) {
      if (!cond_incl)
        error_tok(start, "stray #endif");
      cond_incl = cond_incl->next;
      tok = skip_line(tok->next);
    } else if (!tok->at_bol) {
      // `#`-only line is legal. It's called a null directive.
      // Anything else after the "#" is an unknown directive.
      error_tok(tok, "invalid preprocessor directive");
    }
  }

  *tail = tok;
  return out;
}
2020-08-18 03:41:59 +03:00
// Entry point function of the preprocessor.
//
// Preprocesses the token list `tok`, reports an error if a conditional
// directive is still open at the end, then runs convert_keywords() on
// the result and returns it.
Token *preprocess(Token *tok) {
  Token *result = preprocess2(tok);

  if (cond_incl != NULL)
    error_tok(cond_incl->tok, "unterminated conditional directive");

  convert_keywords(result);
  return result;
}