Allow to concatenate regular string literals with L/u/U string literals

2024-11-22 06:11:18 +03:00 · 2020-10-04 14:58:00 +09:00 · 2020-10-04 14:58:00 +09:00 · 238277714d
commit 238277714d
parent adb8b98889
4 changed files with 88 additions and 2 deletions
--- a/chibicc.h
+++ b/chibicc.h
@ -91,6 +91,7 @@ bool consume(Token **rest, Token *tok, char *str);
 void convert_pp_tokens(Token *tok);
 File **get_input_files(void);
 File *new_file(char *name, int file_no, char *contents);
+Token *tokenize_string_literal(Token *tok, Type *basety);
 Token *tokenize(File *file);
 Token *tokenize_file(char *filename);

--- a/preprocess.c
+++ b/preprocess.c
@ -933,10 +933,59 @@ void init_macros(void) {
  define_macro("__TIME__", format_time(tm));
 }

+// Encoding-prefix category of a string literal token. STR_NONE is an
+// unprefixed "..." literal; the others correspond to the u8, u, U and L
+// prefixes respectively.
+typedef enum {
+  STR_NONE, STR_UTF8, STR_UTF16, STR_UTF32, STR_WIDE,
+} StringKind;
+
+// Classifies a string literal token by the prefix found at tok->loc.
+// The token must start with `"`, `u`, `U` or `L`; any other first
+// character falls through to unreachable().
+static StringKind getStringKind(Token *tok) {
+  // Compare only the two prefix characters. tok->loc is followed by
+  // the rest of the literal (at least the opening quote), so a
+  // whole-string strcmp() against "u8" can never match and u8 literals
+  // would be misclassified as STR_UTF16 by the 'u' case below.
+  if (!strncmp(tok->loc, "u8", 2))
+    return STR_UTF8;
+
+  switch (tok->loc[0]) {
+  case '"': return STR_NONE;
+  case 'u': return STR_UTF16;
+  case 'U': return STR_UTF32;
+  case 'L': return STR_WIDE;
+  }
+  unreachable();
+}
+
 // Concatenate adjacent string literals into a single string literal
 // as per the C spec.
-static void join_adjacent_string_literals(Token *tok1) {
-  while (tok1->kind != TK_EOF) {
+static void join_adjacent_string_literals(Token *tok) {
+  // First pass: If regular string literals are adjacent to wide
+  // string literals, regular string literals are converted to a wide
+  // type before concatenation. In this pass, we do the conversion.
+  for (Token *tok1 = tok; tok1->kind != TK_EOF;) {
+    if (tok1->kind != TK_STR || tok1->next->kind != TK_STR) {
+      tok1 = tok1->next;
+      continue;
+    }
+
+    StringKind kind = getStringKind(tok1);
+    Type *basety = tok1->ty->base;
+
+    for (Token *t = tok1->next; t->kind == TK_STR; t = t->next) {
+      StringKind k = getStringKind(t);
+      if (kind == STR_NONE) {
+        kind = k;
+        basety = t->ty->base;
+      } else if (k != STR_NONE && kind != k) {
+        error_tok(t, "unsupported non-standard concatenation of string literals");
+      }
+    }
+
+    if (basety->size > 1)
+      for (Token *t = tok1; t->kind == TK_STR; t = t->next)
+        if (t->ty->base->size == 1)
+          *t = *tokenize_string_literal(t, basety);
+
+    while (tok1->kind == TK_STR)
+      tok1 = tok1->next;
+  }
+
+  // Second pass: concatenate adjacent string literals.
+  for (Token *tok1 = tok; tok1->kind != TK_EOF;) {
    if (tok1->kind != TK_STR || tok1->next->kind != TK_STR) {
      tok1 = tok1->next;
      continue;
--- a/test/string.c
+++ b/test/string.c
@ -41,6 +41,32 @@ int main() {
  ASSERT(0, !strcmp("abc" "d", "abcd\nefgh"));
  ASSERT(0, strcmp("\x9" "0", "\t0"));

+  ASSERT(16, sizeof(L"abc" ""));
+
+  ASSERT(28, sizeof(L"abc" "def"));
+  ASSERT(28, sizeof(L"abc" L"def"));
+  ASSERT(14, sizeof(u"abc" "def"));
+  ASSERT(14, sizeof(u"abc" u"def"));
+
+  ASSERT(L'a', (L"abc" "def")[0]);
+  ASSERT(L'd', (L"abc" "def")[3]);
+  ASSERT(L'\0', (L"abc" "def")[6]);
+
+  ASSERT(u'a', (u"abc" "def")[0]);
+  ASSERT(u'd', (u"abc" "def")[3]);
+  ASSERT(u'\0', (u"abc" "def")[6]);
+
+  ASSERT(L'あ', ("あ" L"")[0]);
+  ASSERT(0343, ("\343\201\202" L"")[0]);
+  ASSERT(0201, ("\343\201\202" L"")[1]);
+  ASSERT(0202, ("\343\201\202" L"")[2]);
+  ASSERT(0, ("\343\201\202" L"")[3]);
+
+  ASSERT(L'a', ("a" "b" L"c")[0]);
+  ASSERT(L'b', ("a" "b" L"c")[1]);
+  ASSERT(L'c', ("a" "b" L"c")[2]);
+  ASSERT(0, ("a" "b" L"c")[3]);
+
  printf("OK\n");
  return 0;
 }
--- a/tokenize.c
+++ b/tokenize.c
@ -469,6 +469,16 @@ static void add_line_numbers(Token *tok) {
  } while (*p++);
 }

+// Re-reads the string literal that starts at tok->loc using `basety`
+// as the element type and returns the freshly read token, linked to
+// tok->next so it can take tok's place in the token list.
+//
+// A 2-byte element type selects the UTF-16 reader; every other size is
+// handed to the UTF-32 reader together with `basety`. tok->loc is
+// passed as both arguments of the reader, i.e. the literal is read as
+// if it began directly at its opening quote.
+Token *tokenize_string_literal(Token *tok, Type *basety) {
+  char *start = tok->loc;
+  Token *converted = (basety->size == 2)
+    ? read_utf16_string_literal(start, start)
+    : read_utf32_string_literal(start, start, basety);
+
+  converted->next = tok->next;
+  return converted;
+}
+
 // Tokenize a given string and returns new tokens.
 Token *tokenize(File *file) {
  current_file = file;