chibicc/tokenize.c

#include "chibicc.h"

// Name printed in the "file:line: " prefix of located diagnostics.
char *filename;
// Start of the source text: scanned by tokenize() and used by verror_at()
// to find the line and line number of an error location.
char *user_input;
// The current token; read and advanced by the peek/consume/expect helpers.
Token *token;

// Writes a printf-style message followed by a newline to stderr, then
// terminates the process with exit status 1.
void error(char *fmt, ...) {
  va_list args;

  va_start(args, fmt);
  vfprintf(stderr, fmt, args);
  fputs("\n", stderr);
  exit(1);
}

// Prints an error message to stderr in the following format. This function
// does not exit; the caller decides whether the diagnostic is fatal.
//
// foo.c:10: x = y + 1;
//               ^ <error message here>
//
// `loc` must point into the text that starts at `user_input`. `fmt` and `ap`
// are a printf-style format string and its arguments.
void verror_at(char *loc, char *fmt, va_list ap) {
  // Find the start of the line containing `loc`.
  char *line = loc;
  while (user_input < line && line[-1] != '\n')
    line--;

  // Find the end of that line. Stop at the terminating NUL as well, so that
  // a final line without a trailing newline does not run past the buffer.
  char *end = loc;
  while (*end != '\0' && *end != '\n')
    end++;

  // Get a line number (1-based) by counting newlines before the line start.
  int line_num = 1;
  for (char *p = user_input; p < line; p++)
    if (*p == '\n')
      line_num++;

  // Print out the line. fprintf returns the number of characters written,
  // which is the width of the "file:line: " prefix.
  int indent = fprintf(stderr, "%s:%d: ", filename, line_num);
  fprintf(stderr, "%.*s\n", (int)(end - line), line);

  // Show the error message, with the caret placed under `loc`.
  int pos = loc - line + indent;
  fprintf(stderr, "%*s", pos, ""); // print pos spaces.
  fprintf(stderr, "^ ");
  vfprintf(stderr, fmt, ap);
  fprintf(stderr, "\n");
}

// Prints a diagnostic pointing at `loc` (via verror_at) and then terminates
// the process with exit status 1.
void error_at(char *loc, char *fmt, ...) {
  va_list args;

  va_start(args, fmt);
  verror_at(loc, fmt, args);
  exit(1);
}

// Prints a diagnostic and terminates the process with exit status 1.
// When `tok` is non-NULL the message points at the token's source text;
// otherwise only the formatted message and a newline are printed.
void error_tok(Token *tok, char *fmt, ...) {
  va_list args;
  va_start(args, fmt);

  if (!tok) {
    vfprintf(stderr, fmt, args);
    fputs("\n", stderr);
    exit(1);
  }

  verror_at(tok->str, fmt, args);
  exit(1);
}

// Prints a diagnostic like error_tok does, but returns to the caller
// instead of exiting. When `tok` is non-NULL the message points at the
// token's source text; otherwise only the formatted message and a newline
// are printed.
void warn_tok(Token *tok, char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  if (tok) {
    verror_at(tok->str, fmt, ap);
  } else {
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
  }
  // Unlike the error_* functions this one returns, so every va_start must
  // be paired with va_end before leaving the function.
  va_end(ap);
}

// Returns a newly allocated, NUL-terminated copy of at most `len` characters
// of `p`. The caller owns the returned buffer. Prints a message and exits
// with status 1 if memory cannot be allocated.
char *strndup(char *p, int len) {
  char *buf = malloc(len + 1);
  if (!buf) {
    fprintf(stderr, "out of memory\n");
    exit(1);
  }
  // strncpy stops at a NUL in `p` and zero-pads the rest; the explicit
  // terminator below covers the case where `p` has `len` or more characters.
  strncpy(buf, p, len);
  buf[len] = '\0';
  return buf;
}

// Returns the current token if it is a TK_RESERVED token whose text is
// exactly `s`; otherwise returns NULL. The current token is not advanced.
Token *peek(char *s) {
  if (token->kind != TK_RESERVED)
    return NULL;
  if (strlen(s) != token->len)
    return NULL;
  if (memcmp(token->str, s, token->len) != 0)
    return NULL;
  return token;
}

// Consumes the current token if it matches a given string.
Token *consume(char *s) {
  if (!peek(s))
    return NULL;
  Token *t = token;
  token = token->next;
  return t;
}

// Consumes the current token if it is an identifier.
Token *consume_ident() {
  if (token->kind != TK_IDENT)
    return NULL;
  Token *t = token;
  token = token->next;
  return t;
}

// Ensure that the current token is a given string
void expect(char *s) {
  if (!peek(s))
    error_tok(token, "expected \"%s\"", s);
  token = token->next;
}

// Ensure that the current token is TK_NUM.
long expect_number() {
  if (token->kind != TK_NUM)
    error_tok(token, "expected a number");
  long val = token->val;
  token = token->next;
  return val;
}

// Ensure that the current token is TK_IDENT.
char *expect_ident() {
  if (token->kind != TK_IDENT)
    error_tok(token, "expected an identifier");
  char *s = strndup(token->str, token->len);
  token = token->next;
  return s;
}

bool at_eof() {
  return token->kind == TK_EOF;
}

// Creates a new zero-initialized token of the given kind, links it as the
// next token of `cur`, and returns it. `str` and `len` describe the token's
// text; the pointer is stored as-is, not copied. Prints a message and exits
// with status 1 if memory cannot be allocated.
Token *new_token(TokenKind kind, Token *cur, char *str, int len) {
  Token *tok = calloc(1, sizeof *tok);
  if (!tok) {
    fprintf(stderr, "out of memory\n");
    exit(1);
  }
  tok->kind = kind;
  tok->str = str;
  tok->len = len;
  cur->next = tok;
  return tok;
}

// Returns true if the string at `p` begins with the NUL-terminated string
// `q`. An empty `q` matches any `p`.
bool startswith(char *p, char *q) {
  // strncmp stops at the first NUL, so it never reads past the end of `p`
  // when `p` is shorter than `q` (memcmp is allowed to).
  return strncmp(p, q, strlen(q)) == 0;
}

// Returns true if `c` may start an identifier: an ASCII letter or '_'.
bool is_alpha(char c) {
  if (c == '_')
    return true;
  if (c >= 'a' && c <= 'z')
    return true;
  return c >= 'A' && c <= 'Z';
}

// Returns true if `c` may appear inside an identifier: an ASCII digit, or
// any character accepted by is_alpha.
bool is_alnum(char c) {
  if (c >= '0' && c <= '9')
    return true;
  return is_alpha(c);
}

// If the text at `p` begins with a keyword or a multi-letter punctuator,
// returns a pointer to the matching entry of the static tables below;
// otherwise returns NULL. Keywords are checked before punctuators.
char *starts_with_reserved(char *p) {
  static char *kw[] = {"return", "if", "else", "while", "for", "int",
                       "char", "sizeof", "struct", "typedef", "short",
                       "long", "void", "_Bool", "enum", "static", "break",
                       "continue", "goto", "switch", "case", "default"};
  int nkw = sizeof(kw) / sizeof(kw[0]);

  for (int i = 0; i < nkw; i++) {
    char *word = kw[i];
    if (!startswith(p, word))
      continue;
    // A keyword followed by an identifier character is only the prefix of
    // a longer identifier (e.g. "returned"), not a keyword.
    if (is_alnum(p[strlen(word)]))
      continue;
    return word;
  }

  // The first match wins, so the three-character operators are listed
  // ahead of the two-character ones that are their prefixes.
  static char *ops[] = {"<<=", ">>=", "==", "!=", "<=", ">=", "->",
                        "++", "--", "<<", ">>", "+=", "-=", "*=",
                        "/=", "&&", "||"};
  int nops = sizeof(ops) / sizeof(ops[0]);

  for (int i = 0; i < nops; i++) {
    char *op = ops[i];
    if (startswith(p, op))
      return op;
  }

  return NULL;
}

// Maps the character following a backslash in a literal to the character
// the escape sequence stands for. Characters that are not in the table
// stand for themselves (so `\"` yields `"` and `\\` yields `\`).
char get_escape_char(char c) {
  static const struct {
    char letter;
    char value;
  } escapes[] = {
      {'a', '\a'}, {'b', '\b'}, {'t', '\t'}, {'n', '\n'}, {'v', '\v'},
      {'f', '\f'}, {'r', '\r'}, {'e', 27},   {'0', 0},
  };
  int count = sizeof(escapes) / sizeof(escapes[0]);

  for (int i = 0; i < count; i++)
    if (escapes[i].letter == c)
      return escapes[i].value;

  return c;
}

// Reads a double-quoted string literal beginning at `start` (which points
// at the opening quote) and appends a TK_STR token after `cur`.
//
// The token's `len` covers the literal including both quotes. `contents`
// is a newly allocated, NUL-terminated copy with escape sequences decoded,
// and `cont_len` is its size including the terminating NUL.
//
// Reports an error at `start` if the literal is unterminated or its
// decoded contents do not fit the fixed-size scratch buffer.
Token *read_string_literal(Token *cur, char *start) {
  char *p = start + 1;
  char buf[1024];
  int len = 0;

  for (;;) {
    if (len == sizeof(buf))
      error_at(start, "string literal too large");
    if (*p == '\0')
      error_at(start, "unclosed string literal");
    if (*p == '"')
      break;

    if (*p == '\\') {
      p++;
      // A backslash right before the end of input must not be treated as
      // an escape; stepping over the NUL would read past the buffer.
      if (*p == '\0')
        error_at(start, "unclosed string literal");
      buf[len++] = get_escape_char(*p++);
    } else {
      buf[len++] = *p++;
    }
  }

  // `p` points at the closing quote, hence the +1.
  Token *tok = new_token(TK_STR, cur, start, p - start + 1);
  tok->contents = malloc(len + 1);
  if (!tok->contents)
    error_at(start, "out of memory");
  memcpy(tok->contents, buf, len);
  tok->contents[len] = '\0';
  tok->cont_len = len + 1;
  return tok;
}

// Reads a single-quoted character literal beginning at `start` (which
// points at the opening quote) and appends a TK_NUM token after `cur`
// whose value is the (possibly escaped) character. The token's `len`
// covers the literal including both quotes.
//
// Reports an error at `start` if the input ends inside the literal or the
// closing quote does not follow the first character.
Token *read_char_literal(Token *cur, char *start) {
  char *p = start + 1;
  if (*p == '\0')
    error_at(start, "unclosed char literal");

  char c;
  if (*p == '\\') {
    p++;
    // A backslash right before the end of input must not be treated as an
    // escape; stepping over the NUL would read past the buffer.
    if (*p == '\0')
      error_at(start, "unclosed char literal");
    c = get_escape_char(*p++);
  } else {
    c = *p++;
  }

  if (*p != '\'')
    error_at(start, "char literal too long");
  p++;

  Token *tok = new_token(TK_NUM, cur, start, p - start);
  tok->val = c;
  return tok;
}

// Tokenizes the text at `user_input` and returns the first token of a
// singly linked list that ends with a TK_EOF token. Tokens refer to the
// input text by pointer and length; the text is not copied. Reports an
// error and exits on an unclosed block comment or an unrecognized character.
Token *tokenize() {
  char *p = user_input;
  // Dummy head so that every real token can be appended via `cur->next`.
  Token head;
  head.next = NULL;
  Token *cur = &head;

  while (*p) {
    // Skip whitespace characters. The <ctype.h> functions require a value
    // representable as unsigned char, and plain `char` may be negative.
    if (isspace((unsigned char)*p)) {
      p++;
      continue;
    }

    // Skip line comments. Stop at the terminating NUL too, so a comment on
    // a final line without a newline does not run past the end of input.
    if (startswith(p, "//")) {
      p += 2;
      while (*p != '\0' && *p != '\n')
        p++;
      continue;
    }

    // Skip block comments.
    if (startswith(p, "/*")) {
      char *q = strstr(p + 2, "*/");
      if (!q)
        error_at(p, "unclosed block comment");
      p = q + 2;
      continue;
    }

    // Keyword or multi-letter punctuator. This is checked before the
    // single-letter punctuators and identifiers below.
    char *kw = starts_with_reserved(p);
    if (kw) {
      int len = strlen(kw);
      cur = new_token(TK_RESERVED, cur, p, len);
      p += len;
      continue;
    }

    // Single-letter punctuator
    if (strchr("+-*/()<>;={},&[].,!~|^:?", *p)) {
      cur = new_token(TK_RESERVED, cur, p++, 1);
      continue;
    }

    // Identifier
    if (is_alpha(*p)) {
      char *q = p++;
      while (is_alnum(*p))
        p++;
      cur = new_token(TK_IDENT, cur, q, p - q);
      continue;
    }

    // String literal
    if (*p == '"') {
      cur = read_string_literal(cur, p);
      p += cur->len;
      continue;
    }

    // Character literal
    if (*p == '\'') {
      cur = read_char_literal(cur, p);
      p += cur->len;
      continue;
    }

    // Integer literal. The length is filled in after strtol has advanced
    // `p` past the digits.
    if (isdigit((unsigned char)*p)) {
      cur = new_token(TK_NUM, cur, p, 0);
      char *q = p;
      cur->val = strtol(p, &p, 10);
      cur->len = p - q;
      continue;
    }

    error_at(p, "invalid token");
  }

  new_token(TK_EOF, cur, p, 0);
  return head.next;
}
Split main.c into multiple small files 2019-08-03 11:17:13 +03:00			`#include "chibicc.h"`

Read code from a file instead of argv[1] 2019-08-07 02:30:06 +03:00			`char *filename;`
Split main.c into multiple small files 2019-08-03 11:17:13 +03:00			`char *user_input;`
			`Token *token;`

			`// Reports an error and exit.`
			`void error(char *fmt, ...) {`
			`va_list ap;`
			`va_start(ap, fmt);`
			`vfprintf(stderr, fmt, ap);`
			`fprintf(stderr, "\n");`
			`exit(1);`
			`}`

Read code from a file instead of argv[1] 2019-08-07 02:30:06 +03:00			`// Reports an error message in the following format and exit.`
			`//`
			`// foo.c:10: x = y + 1;`
			`// ^ <error message here>`
Add a representative node to each Node to improve error messages 2019-08-05 14:53:58 +03:00			`void verror_at(char *loc, char *fmt, va_list ap) {`
Read code from a file instead of argv[1] 2019-08-07 02:30:06 +03:00			// Find a line containing `loc`.
			`char *line = loc;`
			`while (user_input < line && line[-1] != '\n')`
			`line--;`

			`char *end = loc;`
			`while (*end != '\n')`
			`end++;`

			`// Get a line number.`
			`int line_num = 1;`
			`for (char *p = user_input; p < line; p++)`
			`if (*p == '\n')`
			`line_num++;`

			`// Print out the line.`
			`int indent = fprintf(stderr, "%s:%d: ", filename, line_num);`
			`fprintf(stderr, "%.*s\n", (int)(end - line), line);`

			`// Show the error message.`
			`int pos = loc - line + indent;`
Split main.c into multiple small files 2019-08-03 11:17:13 +03:00			`fprintf(stderr, "%*s", pos, ""); // print pos spaces.`
			`fprintf(stderr, "^ ");`
			`vfprintf(stderr, fmt, ap);`
			`fprintf(stderr, "\n");`
			`}`

Add a representative node to each Node to improve error messages 2019-08-05 14:53:58 +03:00			`// Reports an error location and exit.`
			`void error_at(char *loc, char *fmt, ...) {`
			`va_list ap;`
			`va_start(ap, fmt);`
			`verror_at(loc, fmt, ap);`
Skip excess initializer elements 2019-08-19 02:35:11 +03:00			`exit(1);`
Add a representative node to each Node to improve error messages 2019-08-05 14:53:58 +03:00			`}`

			`// Reports an error location and exit.`
			`void error_tok(Token *tok, char *fmt, ...) {`
			`va_list ap;`
			`va_start(ap, fmt);`
Skip excess initializer elements 2019-08-19 02:35:11 +03:00			`if (tok) {`
Add a representative node to each Node to improve error messages 2019-08-05 14:53:58 +03:00			`verror_at(tok->str, fmt, ap);`
Skip excess initializer elements 2019-08-19 02:35:11 +03:00			`} else {`
			`vfprintf(stderr, fmt, ap);`
			`fprintf(stderr, "\n");`
			`}`
Add a representative node to each Node to improve error messages 2019-08-05 14:53:58 +03:00			`exit(1);`
			`}`

Skip excess initializer elements 2019-08-19 02:35:11 +03:00			`void warn_tok(Token *tok, char *fmt, ...) {`
			`va_list ap;`
			`va_start(ap, fmt);`
			`if (tok) {`
			`verror_at(tok->str, fmt, ap);`
			`} else {`
			`vfprintf(stderr, fmt, ap);`
			`fprintf(stderr, "\n");`
			`}`
			`}`

Support multi-letter local variables 2019-08-04 09:21:44 +03:00			`char *strndup(char *p, int len) {`
			`char *buf = malloc(len + 1);`
			`strncpy(buf, p, len);`
			`buf[len] = '\0';`
			`return buf;`
			`}`

Add keyword "int" and make variable definition mandatory 2019-08-05 16:44:44 +03:00			`// Returns true if the current token matches a given string.`
			`Token *peek(char *s) {`
			`if (token->kind != TK_RESERVED \|\| strlen(s) != token->len \|\|`
			`memcmp(token->str, s, token->len))`
			`return NULL;`
			`return token;`
			`}`

			`// Consumes the current token if it matches a given string.`
			`Token *consume(char *s) {`
			`if (!peek(s))`
Add a representative node to each Node to improve error messages 2019-08-05 14:53:58 +03:00			`return NULL;`
			`Token *t = token;`
Split main.c into multiple small files 2019-08-03 11:17:13 +03:00			`token = token->next;`
Add a representative node to each Node to improve error messages 2019-08-05 14:53:58 +03:00			`return t;`
Split main.c into multiple small files 2019-08-03 11:17:13 +03:00			`}`

Support single-letter local variables 2019-08-03 13:05:07 +03:00			`// Consumes the current token if it is an identifier.`
			`Token *consume_ident() {`
			`if (token->kind != TK_IDENT)`
			`return NULL;`
			`Token *t = token;`
			`token = token->next;`
			`return t;`
			`}`

Add keyword "int" and make variable definition mandatory 2019-08-05 16:44:44 +03:00			`// Ensure that the current token is a given string`
			`void expect(char *s) {`
			`if (!peek(s))`
			`error_tok(token, "expected \"%s\"", s);`
Split main.c into multiple small files 2019-08-03 11:17:13 +03:00			`token = token->next;`
			`}`

			`// Ensure that the current token is TK_NUM.`
Fix large literal number's type If a literal number cannot be represented as an int, it should have type long. 2019-08-11 09:04:35 +03:00			`long expect_number() {`
Split main.c into multiple small files 2019-08-03 11:17:13 +03:00			`if (token->kind != TK_NUM)`
Add a representative node to each Node to improve error messages 2019-08-05 14:53:58 +03:00			`error_tok(token, "expected a number");`
Fix large literal number's type If a literal number cannot be represented as an int, it should have type long. 2019-08-11 09:04:35 +03:00			`long val = token->val;`
Split main.c into multiple small files 2019-08-03 11:17:13 +03:00			`token = token->next;`
			`return val;`
			`}`

Support zero-arity function definition 2019-08-04 13:16:16 +03:00			`// Ensure that the current token is TK_IDENT.`
			`char *expect_ident() {`
			`if (token->kind != TK_IDENT)`
Add a representative node to each Node to improve error messages 2019-08-05 14:53:58 +03:00			`error_tok(token, "expected an identifier");`
Support zero-arity function definition 2019-08-04 13:16:16 +03:00			`char *s = strndup(token->str, token->len);`
			`token = token->next;`
			`return s;`
			`}`

Split main.c into multiple small files 2019-08-03 11:17:13 +03:00			`bool at_eof() {`
			`return token->kind == TK_EOF;`
			`}`

			// Create a new token and add it as the next token of `cur`.
			`Token *new_token(TokenKind kind, Token *cur, char *str, int len) {`
			`Token *tok = calloc(1, sizeof(Token));`
			`tok->kind = kind;`
			`tok->str = str;`
			`tok->len = len;`
			`cur->next = tok;`
			`return tok;`
			`}`

			`bool startswith(char *p, char *q) {`
			`return memcmp(p, q, strlen(q)) == 0;`
			`}`

Add "return" statement 2019-08-03 12:30:00 +03:00			`bool is_alpha(char c) {`
			`return ('a' <= c && c <= 'z') \|\| ('A' <= c && c <= 'Z') \|\| c == '_';`
			`}`

			`bool is_alnum(char c) {`
			`return is_alpha(c) \|\| ('0' <= c && c <= '9');`
			`}`

Add "if" statement 2019-08-04 11:12:59 +03:00			`char *starts_with_reserved(char *p) {`
			`// Keyword`
Add char type 2019-08-06 14:10:55 +03:00			`static char *kw[] = {"return", "if", "else", "while", "for", "int",`
Add short and long types 2019-08-09 04:29:42 +03:00			`"char", "sizeof", "struct", "typedef", "short",`
Add continue statement 2019-08-15 08:04:51 +03:00			`"long", "void", "_Bool", "enum", "static", "break",`
Add switch-case 2019-08-15 10:43:24 +03:00			`"continue", "goto", "switch", "case", "default"};`
Add "if" statement 2019-08-04 11:12:59 +03:00
			`for (int i = 0; i < sizeof(kw) / sizeof(*kw); i++) {`
			`int len = strlen(kw[i]);`
			`if (startswith(p, kw[i]) && !is_alnum(p[len]))`
			`return kw[i];`
			`}`

			`// Multi-letter punctuator`
Add <<, >>, <<= and >>= 2019-08-16 07:45:24 +03:00			`static char *ops[] = {"<<=", ">>=", "==", "!=", "<=", ">=", "->",`
			`"++", "--", "<<", ">>", "+=", "-=", "*=",`
			`"/=", "&&", "\|\|"};`
Add "if" statement 2019-08-04 11:12:59 +03:00
			`for (int i = 0; i < sizeof(ops) / sizeof(*ops); i++)`
			`if (startswith(p, ops[i]))`
			`return ops[i];`

			`return NULL;`
			`}`

Add \a, \b, \t, \n \v, \f, \r, \e and \0 2019-08-06 16:36:19 +03:00			`char get_escape_char(char c) {`
			`switch (c) {`
			`case 'a': return '\a';`
			`case 'b': return '\b';`
			`case 't': return '\t';`
			`case 'n': return '\n';`
			`case 'v': return '\v';`
			`case 'f': return '\f';`
			`case 'r': return '\r';`
			`case 'e': return 27;`
			`case '0': return 0;`
			`default: return c;`
			`}`
			`}`

			`Token *read_string_literal(Token *cur, char *start) {`
			`char *p = start + 1;`
			`char buf[1024];`
			`int len = 0;`

			`for (;;) {`
			`if (len == sizeof(buf))`
			`error_at(start, "string literal too large");`
			`if (*p == '\0')`
			`error_at(start, "unclosed string literal");`
			`if (*p == '"')`
			`break;`

			`if (*p == '\\') {`
			`p++;`
			`buf[len++] = get_escape_char(*p++);`
			`} else {`
			`buf[len++] = *p++;`
			`}`
			`}`

			`Token *tok = new_token(TK_STR, cur, start, p - start + 1);`
			`tok->contents = malloc(len + 1);`
			`memcpy(tok->contents, buf, len);`
			`tok->contents[len] = '\0';`
			`tok->cont_len = len + 1;`
			`return tok;`
			`}`

Add char literal 2019-08-11 10:06:14 +03:00			`Token *read_char_literal(Token *cur, char *start) {`
			`char *p = start + 1;`
			`if (*p == '\0')`
			`error_at(start, "unclosed char literal");`

			`char c;`
			`if (*p == '\\') {`
			`p++;`
			`c = get_escape_char(*p++);`
			`} else {`
			`c = *p++;`
			`}`

			`if (*p != '\'')`
			`error_at(start, "char literal too long");`
			`p++;`

			`Token *tok = new_token(TK_NUM, cur, start, p - start);`
			`tok->val = c;`
			`return tok;`
			`}`

Split main.c into multiple small files 2019-08-03 11:17:13 +03:00			// Tokenize `user_input` and returns new tokens.
			`Token *tokenize() {`
			`char *p = user_input;`
			`Token head;`
			`head.next = NULL;`
			`Token *cur = &head;`

			`while (*p) {`
			`// Skip whitespace characters.`
			`if (isspace(*p)) {`
			`p++;`
			`continue;`
			`}`

Add line and block comments 2019-08-07 02:56:28 +03:00			`// Skip line comments.`
			`if (startswith(p, "//")) {`
			`p += 2;`
			`while (*p != '\n')`
			`p++;`
			`continue;`
			`}`

			`// Skip block comments.`
			`if (startswith(p, "/*")) {`
			`char *q = strstr(p + 2, "*/");`
			`if (!q)`
			`error_at(p, "unclosed block comment");`
			`p = q + 2;`
			`continue;`
			`}`

Add "if" statement 2019-08-04 11:12:59 +03:00			`// Keyword or multi-letter punctuator`
			`char *kw = starts_with_reserved(p);`
			`if (kw) {`
			`int len = strlen(kw);`
			`cur = new_token(TK_RESERVED, cur, p, len);`
			`p += len;`
Split main.c into multiple small files 2019-08-03 11:17:13 +03:00			`continue;`
			`}`

			`// Single-letter punctuator`
Add ?: operator 2019-08-17 04:27:35 +03:00			`if (strchr("+-*/()<>;={},&[].,!~\|^:?", *p)) {`
Split main.c into multiple small files 2019-08-03 11:17:13 +03:00			`cur = new_token(TK_RESERVED, cur, p++, 1);`
			`continue;`
			`}`

Support single-letter local variables 2019-08-03 13:05:07 +03:00			`// Identifier`
Support multi-letter local variables 2019-08-04 09:21:44 +03:00			`if (is_alpha(*p)) {`
			`char *q = p++;`
			`while (is_alnum(*p))`
			`p++;`
			`cur = new_token(TK_IDENT, cur, q, p - q);`
Support single-letter local variables 2019-08-03 13:05:07 +03:00			`continue;`
			`}`

Add string literal 2019-08-06 14:49:57 +03:00			`// String literal`
			`if (*p == '"') {`
Add \a, \b, \t, \n \v, \f, \r, \e and \0 2019-08-06 16:36:19 +03:00			`cur = read_string_literal(cur, p);`
			`p += cur->len;`
Add string literal 2019-08-06 14:49:57 +03:00			`continue;`
			`}`

Add char literal 2019-08-11 10:06:14 +03:00			`// Character literal`
			`if (*p == '\'') {`
			`cur = read_char_literal(cur, p);`
			`p += cur->len;`
			`continue;`
			`}`

Split main.c into multiple small files 2019-08-03 11:17:13 +03:00			`// Integer literal`
			`if (isdigit(*p)) {`
			`cur = new_token(TK_NUM, cur, p, 0);`
			`char *q = p;`
			`cur->val = strtol(p, &p, 10);`
			`cur->len = p - q;`
			`continue;`
			`}`

			`error_at(p, "invalid token");`
			`}`

			`new_token(TK_EOF, cur, p, 0);`
			`return head.next;`
			`}`