Add \a, \b, \t, \n \v, \f, \r and \e

2019-08-06 22:36:19 +09:00 · 2019-08-06 22:36:19 +09:00 · ad7749f2fa
parent 35a0bcd366
commit ad7749f2fa
2 changed files with 70 additions and 6 deletions
--- a/test.sh
+++ b/test.sh
@ -180,4 +180,22 @@ assert 99 'int main() { return "abc"[2]; }'
 assert 0 'int main() { return "abc"[3]; }'
 assert 4 'int main() { return sizeof("abc"); }'

+assert 7 'int main() { return "\a"[0]; }'
+assert 8 'int main() { return "\b"[0]; }'
+assert 9 'int main() { return "\t"[0]; }'
+assert 10 'int main() { return "\n"[0]; }'
+assert 11 'int main() { return "\v"[0]; }'
+assert 12 'int main() { return "\f"[0]; }'
+assert 13 'int main() { return "\r"[0]; }'
+assert 27 'int main() { return "\e"[0]; }'
+
+assert 106 'int main() { return "\j"[0]; }'
+assert 107 'int main() { return "\k"[0]; }'
+assert 108 'int main() { return "\l"[0]; }'
+
+assert 7 'int main() { return "\ax\ny"[0]; }'
+assert 120 'int main() { return "\ax\ny"[1]; }'
+assert 10 'int main() { return "\ax\ny"[2]; }'
+assert 121 'int main() { return "\ax\ny"[3]; }'
+
 echo OK
--- a/tokenize.c
+++ b/tokenize.c
@ -99,15 +99,61 @@ static bool is_keyword(Token *tok) {
  return false;
 }

-static Token *read_string_literal(char *start) {
-  char *p = start + 1;
-  for (; *p != '"'; p++)
+static int read_escaped_char(char *p) {
+  // Escape sequences are defined using themselves here. E.g.
+  // '\n' is implemented using '\n'. This tautological definition
+  // works because the compiler that compiles our compiler knows
+  // what '\n' actually is. In other words, we "inherit" the ASCII
+  // code of '\n' from the compiler that compiles our compiler,
+  // so we don't have to teach the actual code here.
+  //
+  // This fact has huge implications not only for the correctness
+  // of the compiler but also for the security of the generated code.
+  // For more info, read "Reflections on Trusting Trust" by Ken Thompson.
+  // https://github.com/rui314/chibicc/wiki/thompson1984.pdf
+  switch (*p) {
+  case 'a': return '\a';
+  case 'b': return '\b';
+  case 't': return '\t';
+  case 'n': return '\n';
+  case 'v': return '\v';
+  case 'f': return '\f';
+  case 'r': return '\r';
+  // [GNU] \e for the ASCII escape character is a GNU C extension.
+  case 'e': return 27;
+  default: return *p;
+  }
+}
+
+// Find a closing double-quote.
+static char *string_literal_end(char *p) {
+  char *start = p;
+  for (; *p != '"'; p++) {
    if (*p == '\n' || *p == '\0')
      error_at(start, "unclosed string literal");
+    if (*p == '\\')
+      p++;
+  }
+  return p;
+}

-  Token *tok = new_token(TK_STR, start, p + 1);
-  tok->ty = array_of(ty_char, p - start);
-  tok->str = strndup(start + 1, p - start - 1);
+static Token *read_string_literal(char *start) {
+  char *end = string_literal_end(start + 1);
+  char *buf = calloc(1, end - start);
+  int len = 0;
+
+  for (char *p = start + 1; p < end;) {
+    if (*p == '\\') {
+      buf[len++] = read_escaped_char(p + 1);
+      p += 2;
+    } else {
+      buf[len++] = *p++;
+    }
+  }
+
+  Token *tok = new_token(TK_STR, start, end + 1);
+  tok->ty = array_of(ty_char, len + 1);
+  tok->str = buf;
  return tok;
 }