Add UTF-16 string literal

2025-02-18 06:34:05 +03:00 · 2020-09-26 19:56:56 +09:00 · 2020-09-26 19:56:56 +09:00 · 9cabe1f204
commit 9cabe1f204
parent 57b21fe902
2 changed files with 55 additions and 0 deletions
--- a/test/unicode.c
+++ b/test/unicode.c
@ -39,6 +39,18 @@ int main() {

  ASSERT(0, strcmp(STR(u8"a"), "u8\"a\""));

+  ASSERT(2, sizeof(u""));
+  ASSERT(10, sizeof(u"\xffzzz"));
+  ASSERT(0, memcmp(u"", "\0\0", 2));
+  ASSERT(0, memcmp(u"abc", "a\0b\0c\0\0\0", 8));
+  ASSERT(0, memcmp(u"日本語", "\345e,g\236\212\0\0", 8));
+  ASSERT(0, memcmp(u"🍣", "<\330c\337\0\0", 6));
+  ASSERT(u'β', u"βb"[0]);
+  ASSERT(u'b', u"βb"[1]);
+  ASSERT(0, u"βb"[2]);
+
+  ASSERT(0, strcmp(STR(u"a"), "u\"a\""));
+
  printf("OK\n");
  return 0;
 }
--- a/tokenize.c
+++ b/tokenize.c
@ -246,6 +246,42 @@ static Token *read_string_literal(char *start, char *quote) {
  return tok;
 }

+// Read a UTF-8-encoded string literal and transcode it in UTF-16.
+//
+// UTF-16 is yet another variable-width encoding for Unicode. Code
+// points smaller than U+10000 are encoded in 2 bytes. Code points
+// equal to or larger than that are encoded in 4 bytes. Each 2 bytes
+// in the 4 byte sequence is called "surrogate", and a 4 byte sequence
+// is called a "surrogate pair".
+static Token *read_utf16_string_literal(char *start, char *quote) {
+  char *end = string_literal_end(quote + 1);
+  uint16_t *buf = calloc(2, end - start);
+  int len = 0;
+
+  for (char *p = quote + 1; p < end;) {
+    if (*p == '\\') {
+      buf[len++] = read_escaped_char(&p, p + 1);
+      continue;
+    }
+
+    uint32_t c = decode_utf8(&p, p);
+    if (c < 0x10000) {
+      // Encode a code point in 2 bytes.
+      buf[len++] = c;
+    } else {
+      // Encode a code point in 4 bytes.
+      c -= 0x10000;
+      buf[len++] = 0xd800 + ((c >> 10) & 0x3ff);
+      buf[len++] = 0xdc00 + (c & 0x3ff);
+    }
+  }
+
+  Token *tok = new_token(TK_STR, start, end + 1);
+  tok->ty = array_of(ty_ushort, len + 1);
+  tok->str = (char *)buf;
+  return tok;
+}
+
 static Token *read_char_literal(char *start, char *quote, Type *ty) {
  char *p = quote + 1;
  if (*p == '\0')
@ -479,6 +515,13 @@ Token *tokenize(File *file) {
      continue;
    }

+    // UTF-16 string literal
+    if (startswith(p, "u\"")) {
+      cur = cur->next = read_utf16_string_literal(p, p + 1);
+      p += cur->len;
+      continue;
+    }
+
    // Character literal
    if (*p == '\'') {
      cur = cur->next = read_char_literal(p, p, ty_int);